Skip to content

Commit

Permalink
formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
Lars Neumann committed Dec 7, 2015
1 parent 556829a commit 5f1c6a8
Show file tree
Hide file tree
Showing 5 changed files with 162 additions and 158 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import org.apache.lucene.analysis.Tokenizer;

public class PhoneAnalyzer extends Analyzer {

@Override
protected TokenStreamComponents createComponents(String field) {
Tokenizer tokenizer = new PhoneTokenizer();
Expand Down
289 changes: 146 additions & 143 deletions src/main/java/org/elasticsearch/index/analysis/PhoneTokenizer.java
Original file line number Diff line number Diff line change
@@ -1,159 +1,162 @@
package org.elasticsearch.index.analysis;


import java.io.IOException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import com.google.i18n.phonenumbers.NumberParseException;
import com.google.i18n.phonenumbers.PhoneNumberUtil;
import com.google.i18n.phonenumbers.Phonenumber.PhoneNumber;

public class PhoneTokenizer extends Tokenizer {

// The raw input, read from the reader once by getStringToTokenize() and cached until reset().
private String stringToTokenize = null;

// Index into the tokens list of the next token to emit. All tokens are built up front and then
// returned one at a time as incrementToken gets called.
private int position = 0;

/**
* The tokens are determined on the first iteration and then returned one at a time
* thereafter. A null value means the tokens have not been generated yet.
*/

private List<String> tokens = null;

// Term attribute that receives the text of the token currently being emitted.
protected CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);

// No configuration is needed; all state is built lazily on the first incrementToken call.
public PhoneTokenizer() {
}

/**
 * Advances the stream to the next token.
 *
 * <p>On the first call after a reset the whole input is read and split into tokens; every call
 * then emits exactly one of those tokens through the term attribute.
 *
 * @return {@code true} if a token was written to the term attribute, {@code false} once every
 *         token has been returned
 * @throws IOException declared by the overridden method; read failures in this class surface as
 *         a {@link RuntimeException} wrapping the original {@link IOException}
 */
@Override
public final boolean incrementToken() throws IOException {
    // A Tokenizer must reset ALL attributes before producing a token, not only the term text,
    // so that values set by a previous token or by downstream filters cannot leak through.
    clearAttributes();

    if (tokens == null) {
        // It's the 1st iteration, chop the input up into tokens.
        generateTokens();
    }

    return returnTokensOneAtATime();
}

/**
 * Emits the token at the current position, if there is one, and moves on to the next.
 *
 * @return {@code true} if a token was appended to the term attribute, {@code false} if the tokens
 *         have not been generated or have all been returned
 */
private boolean returnTokensOneAtATime() {
    boolean exhausted = tokens == null || this.position == tokens.size();
    if (exhausted) {
        return false;
    }

    String next = tokens.get(this.position);
    this.position++;
    this.charTermAttribute.append(next);
    return true;
}

/**
 * Reads the raw input and builds the complete token list for it.
 *
 * <p>The untouched input is always the first token. If the input starts with {@code tel:} or
 * {@code sip:}, the part before the first {@code @} is added as a token, then (when libphonenumber
 * can parse it) the country code, the extension and the national number, and finally every
 * leading prefix of the number, with and without the country code in front.
 *
 * <p>Input that does not start with one of the two schemes produces exactly one token.
 */
private void generateTokens() {
    String raw = getStringToTokenize();

    tokens = new ArrayList<String>();
    tokens.add(raw);

    // Only a scheme at the very START of the input is meaningful; a "tel:" buried in the middle
    // must not cause the first four characters to be chopped off. Without a scheme the whole
    // string stays a single token (it was already added above, so do not add it again).
    if (!raw.startsWith("tel:") && !raw.startsWith("sip:")) {
        return;
    }
    String uri = raw.substring(4);

    // Keep only what precedes the first '@'. substringBefore (unlike split) does not skip an
    // empty leading part, so "sip:@host" is not mistaken for the user "host".
    String number = StringUtils.substringBefore(uri, "@");
    if (number.isEmpty()) {
        return;
    }

    // Add a token for the raw unmanipulated address. Note this could be a username (sip) instead
    // of a telephone number, so take it as is.
    tokens.add(number);

    // Let google's libphone try to parse it
    PhoneNumberUtil phoneUtil = PhoneNumberUtil.getInstance();
    String countryCode = null;
    try {
        // ZZ is the generic "I don't know the country code" region. Google's libphone library
        // will try to infer it.
        PhoneNumber numberProto = phoneUtil.parse(number, "ZZ");
        if (numberProto != null) {
            // Libphone likes it!
            countryCode = String.valueOf(numberProto.getCountryCode());
            number = String.valueOf(numberProto.getNationalNumber());

            // Add country code, extension, and the national number as tokens
            tokens.add(countryCode);
            if (!StringUtils.isEmpty(numberProto.getExtension())) {
                tokens.add(numberProto.getExtension());
            }
            tokens.add(number);
        }
    } catch (NumberParseException ignored) {
        // Deliberately ignored: libphone didn't like it, no biggie. We'll just ngram the number
        // as it is, provided it consists of digits only.
    }

    // ngram the phone number, e.g. national number 9198243333 produces 9, 91, 919, etc.
    // isDigits is used rather than isNumber so that hex, decimal or exponent notation
    // ("0x1F", "1.5", "1e5") is not treated as a phone number.
    if (NumberUtils.isDigits(number)) {
        for (int end = 1; end <= number.length(); end++) {
            String prefix = number.substring(0, end);
            tokens.add(prefix);
            if (countryCode != null) {
                // If there was a country code, add more ngrams such that 19198243333 produces
                // 19, 191, 1919, etc
                tokens.add(countryCode + prefix);
            }
        }
    }
}

/**
 * Returns the complete input as a string, reading it from the reader on first use and caching it
 * for later calls.
 *
 * @return the raw input text
 * @throws RuntimeException wrapping the {@link IOException} if the input cannot be read
 */
private String getStringToTokenize() {
    if (this.stringToTokenize != null) {
        return this.stringToTokenize;
    }

    try {
        this.stringToTokenize = IOUtils.toString(input);
    } catch (IOException readFailure) {
        throw new RuntimeException(readFailure);
    }
    return this.stringToTokenize;
}

/**
 * Discards everything remembered from the previous input so the instance can be used again
 * (lucene will re-use an instance of this tokenizer over and over again).
 *
 * @throws IOException if the superclass reset fails
 */
@Override
public final void reset() throws IOException {
    super.reset();
    clearAttributes();
    this.stringToTokenize = null;
    this.tokens = null;
    this.position = 0;
}
// The raw input, read from the reader once by getStringToTokenize() and cached until reset().
private String stringToTokenize = null;

// Index into the tokens list of the next token to emit. All tokens are built up front and then
// returned one at a time as incrementToken gets called.
private int position = 0;

/**
* The tokens are determined on the first iteration and then returned one at a time thereafter.
* A null value means the tokens have not been generated yet.
*/

private List<String> tokens = null;

// Term attribute that receives the text of the token currently being emitted.
protected CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);

// No configuration is needed; all state is built lazily on the first incrementToken call.
public PhoneTokenizer() {
}

/**
 * Advances the stream to the next token.
 *
 * <p>On the first call after a reset the whole input is read and split into tokens; every call
 * then emits exactly one of those tokens through the term attribute.
 *
 * @return {@code true} if a token was written to the term attribute, {@code false} once every
 *         token has been returned
 * @throws IOException declared by the overridden method; read failures in this class surface as
 *         a {@link RuntimeException} wrapping the original {@link IOException}
 */
@Override
public final boolean incrementToken() throws IOException {
    // A Tokenizer must reset ALL attributes before producing a token, not only the term text,
    // so that values set by a previous token or by downstream filters cannot leak through.
    clearAttributes();

    if (tokens == null) {
        // It's the 1st iteration, chop the input up into tokens.
        generateTokens();
    }

    return returnTokensOneAtATime();
}

/**
 * Emits the token at the current position, if there is one, and moves on to the next.
 *
 * @return {@code true} if a token was appended to the term attribute, {@code false} if the tokens
 *         have not been generated or have all been returned
 */
private boolean returnTokensOneAtATime() {
    boolean exhausted = tokens == null || this.position == tokens.size();
    if (exhausted) {
        return false;
    }

    String next = tokens.get(this.position);
    this.position++;
    this.charTermAttribute.append(next);
    return true;
}

/**
 * Reads the raw input and builds the complete token list for it.
 *
 * <p>The untouched input is always the first token. If the input starts with {@code tel:} or
 * {@code sip:}, the part before the first {@code @} is added as a token, then (when libphonenumber
 * can parse it) the country code, the extension and the national number, and finally every
 * leading prefix of the number, with and without the country code in front.
 *
 * <p>Input that does not start with one of the two schemes produces exactly one token.
 */
private void generateTokens() {
    String raw = getStringToTokenize();

    tokens = new ArrayList<String>();
    tokens.add(raw);

    // Only a scheme at the very START of the input is meaningful; a "tel:" buried in the middle
    // must not cause the first four characters to be chopped off. Without a scheme the whole
    // string stays a single token (it was already added above, so do not add it again).
    if (!raw.startsWith("tel:") && !raw.startsWith("sip:")) {
        return;
    }
    String uri = raw.substring(4);

    // Keep only what precedes the first '@'. substringBefore (unlike split) does not skip an
    // empty leading part, so "sip:@host" is not mistaken for the user "host".
    String number = StringUtils.substringBefore(uri, "@");
    if (number.isEmpty()) {
        return;
    }

    // Add a token for the raw unmanipulated address. Note this could be a username (sip) instead
    // of a telephone number, so take it as is.
    tokens.add(number);

    // Let google's libphone try to parse it
    PhoneNumberUtil phoneUtil = PhoneNumberUtil.getInstance();
    String countryCode = null;
    try {
        // ZZ is the generic "I don't know the country code" region. Google's libphone library
        // will try to infer it.
        PhoneNumber numberProto = phoneUtil.parse(number, "ZZ");
        if (numberProto != null) {
            // Libphone likes it!
            countryCode = String.valueOf(numberProto.getCountryCode());
            number = String.valueOf(numberProto.getNationalNumber());

            // Add country code, extension, and the national number as tokens
            tokens.add(countryCode);
            if (!StringUtils.isEmpty(numberProto.getExtension())) {
                tokens.add(numberProto.getExtension());
            }
            tokens.add(number);
        }
    } catch (NumberParseException ignored) {
        // Deliberately ignored: libphone didn't like it, no biggie. We'll just ngram the number
        // as it is, provided it consists of digits only.
    }

    // ngram the phone number, e.g. national number 9198243333 produces 9, 91, 919, etc.
    // isDigits is used rather than isNumber so that hex, decimal or exponent notation
    // ("0x1F", "1.5", "1e5") is not treated as a phone number.
    if (NumberUtils.isDigits(number)) {
        for (int end = 1; end <= number.length(); end++) {
            String prefix = number.substring(0, end);
            tokens.add(prefix);
            if (countryCode != null) {
                // If there was a country code, add more ngrams such that 19198243333 produces
                // 19, 191, 1919, etc
                tokens.add(countryCode + prefix);
            }
        }
    }
}

/**
 * Returns the complete input as a string, reading it from the reader on first use and caching it
 * for later calls.
 *
 * @return the raw input text
 * @throws RuntimeException wrapping the {@link IOException} if the input cannot be read
 */
private String getStringToTokenize() {
    if (this.stringToTokenize != null) {
        return this.stringToTokenize;
    }

    try {
        this.stringToTokenize = IOUtils.toString(input);
    } catch (IOException readFailure) {
        throw new RuntimeException(readFailure);
    }
    return this.stringToTokenize;
}

/**
 * Discards everything remembered from the previous input so the instance can be used again
 * (lucene will re-use an instance of this tokenizer over and over again).
 *
 * @throws IOException if the superclass reset fails
 */
@Override
public final void reset() throws IOException {
    super.reset();
    clearAttributes();
    this.stringToTokenize = null;
    this.tokens = null;
    this.position = 0;
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
package org.elasticsearch.plugins.analysis.phone;


import java.io.IOException;

import org.elasticsearch.common.inject.Inject;
Expand All @@ -12,19 +11,20 @@
import org.elasticsearch.index.analysis.PhoneAnalyzer;

public class PhoneAnalyzerProvider extends AbstractIndexAnalyzerProvider<PhoneAnalyzer> {

protected PhoneAnalyzer analyzer = new PhoneAnalyzer();

public static final String NAME = "phone";

/**
 * Creates the provider by forwarding the index, index settings, analyzer name and analyzer
 * settings to the superclass constructor.
 *
 * @param index the index, passed to the superclass
 * @param indexSettings the index settings, passed to the superclass
 * @param env injected but not used by this constructor
 * @param name the analyzer name, passed to the superclass
 * @param settings the analyzer settings, passed to the superclass
 * @throws IOException declared in the signature; nothing in this body throws it
 */
@Inject
public PhoneAnalyzerProvider(Index index, Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) throws IOException {
super(index, indexSettings, name, settings);
}
/**
 * Creates the provider by forwarding the index, index settings, analyzer name and analyzer
 * settings to the superclass constructor.
 *
 * @param index the index, passed to the superclass
 * @param indexSettings the index settings, passed to the superclass
 * @param env injected but not used by this constructor
 * @param name the analyzer name, passed to the superclass
 * @param settings the analyzer settings, passed to the superclass
 * @throws IOException declared in the signature; nothing in this body throws it
 */
@Inject
public PhoneAnalyzerProvider(Index index, Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings)
throws IOException {
super(index, indexSettings, name, settings);
}

/**
 * Returns the analyzer instance held by this provider.
 *
 * @return the {@link PhoneAnalyzer} stored in the {@code analyzer} field
 */
@Override
public PhoneAnalyzer get() {
    return this.analyzer;
}
/**
 * Returns the analyzer instance held by this provider.
 *
 * @return the {@link PhoneAnalyzer} stored in the {@code analyzer} field
 */
@Override
public PhoneAnalyzer get() {
    return this.analyzer;
}

}
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
package org.elasticsearch.plugins.analysis.phone;
import org.elasticsearch.index.analysis.AnalysisModule;

import org.elasticsearch.index.analysis.AnalysisModule;

/**
 * Binder processor that registers {@link PhoneAnalyzerProvider} under the name
 * {@link PhoneAnalyzerProvider#NAME}.
 */
public class PhoneBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {

/**
 * Registers the phone analyzer provider with the supplied bindings.
 *
 * @param analyzersBindings the bindings the analyzer provider is registered with
 */
@Override
public void processAnalyzers(AnalyzersBindings analyzersBindings) {
// NOTE(review): the same registration appears twice in a row; this looks like the old and new
// side of a whitespace-only diff rather than an intentional double call - confirm and drop one.
analyzersBindings.processAnalyzer(PhoneAnalyzerProvider.NAME, PhoneAnalyzerProvider.class);
analyzersBindings.processAnalyzer(PhoneAnalyzerProvider.NAME, PhoneAnalyzerProvider.class);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
public class PhonePlugin extends Plugin {

public static final String NAME = "phone-plugin";

/* Return a description of this plugin. */
public String description() {
return "Makes a best attempt at tokenizing a phone number or sip address";
Expand Down

0 comments on commit 5f1c6a8

Please sign in to comment.