Skip to content

Commit

Permalink
NonStrictPhoneTokenizer update - added generateNGrams parameter to wo…
Browse files Browse the repository at this point in the history
…rk in two modes: generate strict match phone tokens or ngrams
  • Loading branch information
chorniyn committed Mar 20, 2017
1 parent 4d66123 commit 7157532
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ public class NonStrictPhoneTokenizer extends Tokenizer {
private String defaultRegion = "US";
private boolean addCountryCode = false;
private boolean addExtension = false;
private boolean generateNGrams = false;

// The raw input
private String stringToTokenize = null;
Expand Down Expand Up @@ -87,17 +88,16 @@ private void generateTokens() {

String number = parts[0];

// Add a token for the raw unmanipulated address. Note this could be a username (sip) instead of
// telephone number so take it as is
tokens.add(number);

// Let google's libphone try to parse it
PhoneNumberUtil phoneUtil = PhoneNumberUtil.getInstance();
PhoneNumber numberProto;
String countryCode = null;
boolean validPhone = false;
try {
numberProto = phoneUtil.parse(number, defaultRegion);
if (numberProto != null) {
validPhone = phoneUtil.isValidNumber(numberProto);
if (numberProto != null && validPhone) {
// Libphone likes it!
countryCode = String.valueOf(numberProto.getCountryCode());
number = String.valueOf(numberProto.getNationalNumber());
Expand All @@ -107,8 +107,6 @@ private void generateTokens() {
if (addExtension && !StringUtils.isEmpty(numberProto.getExtension())) {
tokens.add(numberProto.getExtension());
}

tokens.add(number);
}
} catch (NumberParseException e) {
// Libphone didn't like it, no biggie. We'll just ngram the number as it is.
Expand All @@ -117,19 +115,43 @@ private void generateTokens() {
}

// ngram the phone number EG 19198243333 produces 9, 91, 919, etc
if (NumberUtils.isNumber(number)) {
for (int count = 1; count <= number.length(); count++) {
String token = number.substring(0, count);
tokens.add(token);
if (countryCode != null) {
// If there was a country code, add more ngrams such that 19198243333 produces 19, 191,
// 1919, etc
tokens.add(countryCode + token);
if (validPhone) {
boolean hasCountryCode = StringUtils.isNotBlank(countryCode);
if (generateNGrams) {
for (int count = 1; count <= number.length(); count++) {
String token = number.substring(0, count);
tokens.add(token);
if (hasCountryCode) {
// If there was a country code, add more ngrams such that 19198243333 produces 19, 191,
// 1919, etc
tokens.add(countryCode + token);
}
}
} else {
tokens.add(number);
if (hasCountryCode) {
tokens.add(countryCode + number);
}
}
} else {
number = cleanNumber(number);
if (generateNGrams) {
for (int count = 1; count <= number.length(); count++) {
String token = number.substring(0, count);
tokens.add(token);
}
} else {
tokens.add(number);
}
tokens.add(number);
}
}

/**
 * Placeholder hook for normalizing a number string before it is tokenized.
 *
 * <p>The cleaning logic is not implemented yet: the argument is returned unchanged.
 * NOTE(review): the visible caller passes input that was not recognized as a valid
 * phone number, which may be a non-numeric address such as a SIP username — confirm
 * what "cleaning" should preserve before implementing.
 *
 * @param number the number string to clean
 * @return the cleaned number; currently the same value that was passed in
 */
private String cleanNumber(String number) {
// TODO: implement the actual cleaning; until then this is an identity pass-through
return number;
}

/**
* Read the input into a local variable
*
Expand Down
5 changes: 4 additions & 1 deletion src/test/java/tests/PhoneUtilsTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
import static org.hamcrest.CoreMatchers.notNullValue;
import static org.hamcrest.MatcherAssert.assertThat;


/**
* Tests {@link PhoneNumberUtil#parse(String, String)}: verifies that the util parses typical valid phone numbers
* from the exelare db and fails to parse typical invalid ones.
*/
public class PhoneUtilsTest {

PhoneNumberUtil phoneUtil = PhoneNumberUtil.getInstance();
Expand Down

0 comments on commit 7157532

Please sign in to comment.