From a032fd20d108a6da78c5c59596e07cb04ff086d1 Mon Sep 17 00:00:00 2001 From: Peter Westermann Date: Fri, 3 Nov 2017 11:57:09 -0400 Subject: [PATCH 1/3] AR-5858 Added full input to tokenizer --- README.md | 2 ++ .../java/org/elasticsearch/index/analysis/PhoneAnalyzer.java | 3 +-- .../org/elasticsearch/index/analysis/PhoneEmailAnalyzer.java | 3 +-- .../org/elasticsearch/index/analysis/PhoneSearchAnalyzer.java | 3 +-- .../org/elasticsearch/index/analysis/PhoneTermExtractor.java | 1 + 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index ed19d5e..2c41d8e 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,7 @@ Input (with country code): `sip:+13169410766;ext=2233@172.17.10.117:8060` Tokens: ``` +sip:+13169410766;ext=2233@172.17.10.117:8060 sip: 13169410766;ext=2233@172.17.10.117:8060 13169410766;ext=2233 @@ -81,6 +82,7 @@ Input (without a country code): `tel:8177148350` Tokens: ``` +tel:8177148350 tel: 8177148350 8 diff --git a/src/main/java/org/elasticsearch/index/analysis/PhoneAnalyzer.java b/src/main/java/org/elasticsearch/index/analysis/PhoneAnalyzer.java index 0f81773..5abc0c5 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PhoneAnalyzer.java +++ b/src/main/java/org/elasticsearch/index/analysis/PhoneAnalyzer.java @@ -4,7 +4,6 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; public class PhoneAnalyzer extends Analyzer { @@ -12,6 +11,6 @@ public class PhoneAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String field, Reader reader) { Tokenizer tokenizer = new TermExtractorTokenizer(reader, new PhoneTermExtractor()); - return new TokenStreamComponents(tokenizer, new LowerCaseFilter(new UniqueTokenFilter(tokenizer))); + return new TokenStreamComponents(tokenizer, new UniqueTokenFilter(tokenizer)); } } \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/index/analysis/PhoneEmailAnalyzer.java b/src/main/java/org/elasticsearch/index/analysis/PhoneEmailAnalyzer.java index 0c33eaf..e141e58 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PhoneEmailAnalyzer.java +++ b/src/main/java/org/elasticsearch/index/analysis/PhoneEmailAnalyzer.java @@ -3,7 +3,6 @@ import java.io.Reader; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; /** @@ -14,6 +13,6 @@ public class PhoneEmailAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { TermExtractorTokenizer tokenizer = new TermExtractorTokenizer(reader, new PhoneTermExtractor(), new EmailTermExtractor()); - return new TokenStreamComponents(tokenizer, new LowerCaseFilter(new UniqueTokenFilter(tokenizer))); + return new TokenStreamComponents(tokenizer, new UniqueTokenFilter(tokenizer)); } } diff --git a/src/main/java/org/elasticsearch/index/analysis/PhoneSearchAnalyzer.java b/src/main/java/org/elasticsearch/index/analysis/PhoneSearchAnalyzer.java index c0507b5..e918773 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PhoneSearchAnalyzer.java +++ b/src/main/java/org/elasticsearch/index/analysis/PhoneSearchAnalyzer.java @@ -4,13 +4,12 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.LowerCaseFilter; public class PhoneSearchAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String field, Reader reader) { Tokenizer tokenizer = new TermExtractorTokenizer(reader, new PhoneSearchTermExtractor()); - return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer)); + return new TokenStreamComponents(tokenizer, tokenizer); } } diff --git a/src/main/java/org/elasticsearch/index/analysis/PhoneTermExtractor.java b/src/main/java/org/elasticsearch/index/analysis/PhoneTermExtractor.java index cf29d97..3934746 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PhoneTermExtractor.java +++ b/src/main/java/org/elasticsearch/index/analysis/PhoneTermExtractor.java @@ -15,6 +15,7 @@ public class PhoneTermExtractor implements TermExtractor { @Override public List extractTerms(String input) { List tokens = new ArrayList(); + tokens.add(input); // Rip off the "tel:" or "sip:" prefix if (input.indexOf("tel:") == 0 || input.indexOf("sip:") == 0) { tokens.add(input.substring(0, 4)); From 64dbbbd8c3112044e16a4c3869f4d6180901b654 Mon Sep 17 00:00:00 2001 From: Peter Westermann Date: Fri, 3 Nov 2017 11:57:09 -0400 Subject: [PATCH 2/3] AR-5758 Added full input to tokenizer --- README.md | 4 ++-- .../java/org/elasticsearch/index/analysis/PhoneAnalyzer.java | 3 +-- .../org/elasticsearch/index/analysis/PhoneEmailAnalyzer.java | 3 +-- .../org/elasticsearch/index/analysis/PhoneSearchAnalyzer.java | 3 +-- .../org/elasticsearch/index/analysis/PhoneTermExtractor.java | 1 + src/test/java/tests/PhoneTokenizerIntegrationTest.java | 2 +- 6 files changed, 7 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index ed19d5e..8892e9e 100644 --- a/README.md +++ b/README.md @@ -18,8 +18,6 @@ This project provides three analyzers that are intended for different contexts. * The `phone-email` analyzer extends the `phone` analyzer with additional tokenization for email addresses (e.g. generating tokens for the user part and the domain part of an email address). * The `phone-search` analyzer is intended to be used as a `search_analyzer` with one of the other two analyzers used for indexing. It does minimal tokenization: If a term starts with `sip:` or `tel:` it strips this part and generates a token for it. The analyzer also strips a leading `+` from phone numbers. -All three analyzers remove non-unique tokens and transform terms to lowercase. - ## Example inputs @@ -48,6 +46,7 @@ Input (with country code): `sip:+13169410766;ext=2233@172.17.10.117:8060` Tokens: ``` +sip:+13169410766;ext=2233@172.17.10.117:8060 sip: 13169410766;ext=2233@172.17.10.117:8060 13169410766;ext=2233 @@ -81,6 +80,7 @@ Input (without a country code): `tel:8177148350` Tokens: ``` +tel:8177148350 tel: 8177148350 8 diff --git a/src/main/java/org/elasticsearch/index/analysis/PhoneAnalyzer.java b/src/main/java/org/elasticsearch/index/analysis/PhoneAnalyzer.java index 0f81773..5abc0c5 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PhoneAnalyzer.java +++ b/src/main/java/org/elasticsearch/index/analysis/PhoneAnalyzer.java @@ -4,7 +4,6 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; public class PhoneAnalyzer extends Analyzer { @@ -12,6 +11,6 @@ public class PhoneAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String field, Reader reader) { Tokenizer tokenizer = new TermExtractorTokenizer(reader, new PhoneTermExtractor()); - return new TokenStreamComponents(tokenizer, new LowerCaseFilter(new UniqueTokenFilter(tokenizer))); + return new TokenStreamComponents(tokenizer, new UniqueTokenFilter(tokenizer)); } } \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/index/analysis/PhoneEmailAnalyzer.java b/src/main/java/org/elasticsearch/index/analysis/PhoneEmailAnalyzer.java index 0c33eaf..e141e58 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PhoneEmailAnalyzer.java +++ b/src/main/java/org/elasticsearch/index/analysis/PhoneEmailAnalyzer.java @@ -3,7 +3,6 @@ import java.io.Reader; import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter; /** @@ -14,6 +13,6 @@ public class PhoneEmailAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { TermExtractorTokenizer tokenizer = new TermExtractorTokenizer(reader, new PhoneTermExtractor(), new EmailTermExtractor()); - return new TokenStreamComponents(tokenizer, new LowerCaseFilter(new UniqueTokenFilter(tokenizer))); + return new TokenStreamComponents(tokenizer, new UniqueTokenFilter(tokenizer)); } } diff --git a/src/main/java/org/elasticsearch/index/analysis/PhoneSearchAnalyzer.java b/src/main/java/org/elasticsearch/index/analysis/PhoneSearchAnalyzer.java index c0507b5..e918773 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PhoneSearchAnalyzer.java +++ b/src/main/java/org/elasticsearch/index/analysis/PhoneSearchAnalyzer.java @@ -4,13 +4,12 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.core.LowerCaseFilter; public class PhoneSearchAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String field, Reader reader) { Tokenizer tokenizer = new TermExtractorTokenizer(reader, new PhoneSearchTermExtractor()); - return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer)); + return new TokenStreamComponents(tokenizer, tokenizer); } } diff --git a/src/main/java/org/elasticsearch/index/analysis/PhoneTermExtractor.java b/src/main/java/org/elasticsearch/index/analysis/PhoneTermExtractor.java index cf29d97..3934746 100644 --- a/src/main/java/org/elasticsearch/index/analysis/PhoneTermExtractor.java +++ b/src/main/java/org/elasticsearch/index/analysis/PhoneTermExtractor.java @@ -15,6 +15,7 @@ public class PhoneTermExtractor implements TermExtractor { @Override public List extractTerms(String input) { List tokens = new ArrayList(); + tokens.add(input); // Rip off the "tel:" or "sip:" prefix if (input.indexOf("tel:") == 0 || input.indexOf("sip:") == 0) { tokens.add(input.substring(0, 4)); diff --git a/src/test/java/tests/PhoneTokenizerIntegrationTest.java b/src/test/java/tests/PhoneTokenizerIntegrationTest.java index 2f97bc6..e95abed 100644 --- a/src/test/java/tests/PhoneTokenizerIntegrationTest.java +++ b/src/test/java/tests/PhoneTokenizerIntegrationTest.java @@ -147,7 +147,7 @@ public void testSipWithTelephoneExtension() throws ExecutionException, Interrupt @Test public void testSipWithUsername() throws ExecutionException, InterruptedException, IOException { - assertIncludes("sip:JeffSIP@178.12.220.18", Arrays.asList("jeffsip")); + assertIncludes("sip:JeffSIP@178.12.220.18", Arrays.asList("JeffSIP")); } @Test From 3de420be4c2797f170767f4fb47989e9f0a7139c Mon Sep 17 00:00:00 2001 From: Peter Westermann Date: Fri, 3 Nov 2017 12:37:03 -0400 Subject: [PATCH 3/3] AR-5758 Changed to snapshot --- pom.xml | 46 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/pom.xml b/pom.xml index 79844ad..a0f65c6 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ com.inin.analytics elasticsearch-phone jar - 1.0.2 + 1.0.2-SNAPSHOT elasticsearch-phone Elasticsearch Plugin for Phone and SIP Analysis https://github.com/MyPureCloud/elasticsearch-phone @@ -47,6 +47,28 @@ false + + inin-release + ININ Release Repository + https://purecloud.artifactoryonline.com/purecloud/inin-release + + true + + + false + + + + inin-snapshot + ININ Snapshot Repository + https://purecloud.artifactoryonline.com/purecloud/inin-snapshot + + false + + + true + + @@ -126,16 +148,18 @@ - - - ossrh - https://oss.sonatype.org/content/repositories/snapshots - - - ossrh - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - - + + + inin-release + ININ Release Repository + https://purecloud.artifactoryonline.com/purecloud/inin-release + + + inin-snapshot + ININ Snapshot Repository + https://purecloud.artifactoryonline.com/purecloud/inin-snapshot + +