From 70576add55f64d99c7c9412a1ed0c0f55f5bb12f Mon Sep 17 00:00:00 2001 From: "drew.dahlke" Date: Tue, 4 Aug 2015 10:45:35 -0400 Subject: [PATCH] inital commit --- LICENCE.md | 191 +++++++++++++++++ LICENSE | 202 ------------------ README.md | 81 ++++++- pom.xml | 176 +++++++++++++++ src/main/assemblies/plugin.xml | 27 +++ .../index/analysis/PhoneAnalyzer.java | 15 ++ .../index/analysis/PhoneTokenizer.java | 157 ++++++++++++++ .../analysis/phone/PhoneAnalyzerProvider.java | 36 ++++ .../analysis/phone/PhoneBinderProcessor.java | 10 + .../plugins/analysis/phone/PhonePlugin.java | 31 +++ src/main/resources/es-plugin.properties | 3 + src/test/java/tests/PhoneIntegrationTest.java | 167 +++++++++++++++ 12 files changed, 892 insertions(+), 204 deletions(-) create mode 100644 LICENCE.md delete mode 100644 LICENSE create mode 100644 pom.xml create mode 100644 src/main/assemblies/plugin.xml create mode 100644 src/main/java/org/elasticsearch/index/analysis/PhoneAnalyzer.java create mode 100644 src/main/java/org/elasticsearch/index/analysis/PhoneTokenizer.java create mode 100644 src/main/java/org/elasticsearch/plugins/analysis/phone/PhoneAnalyzerProvider.java create mode 100644 src/main/java/org/elasticsearch/plugins/analysis/phone/PhoneBinderProcessor.java create mode 100644 src/main/java/org/elasticsearch/plugins/analysis/phone/PhonePlugin.java create mode 100644 src/main/resources/es-plugin.properties create mode 100644 src/test/java/tests/PhoneIntegrationTest.java diff --git a/LICENCE.md b/LICENCE.md new file mode 100644 index 0000000..8ea8fc5 --- /dev/null +++ b/LICENCE.md @@ -0,0 +1,191 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. 
+ +"Licensor" shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. +For the purposes of this definition, "control" means (i) the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising +permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +"Object" form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. 
+ +"Contribution" shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. For the purposes of this definition, +"submitted" means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +3. Grant of Patent License. 
+ +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. + +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of +this License; and +You must cause any modified files to carry prominent notices stating that You +changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file 
distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +6. Trademarks. + +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
+ +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +8. Limitation of Liability. + +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets "[]" replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same "printed page" as the copyright notice for easier identification within +third-party archives. + + Copyright 2015 Interactive Intelligence + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 8f71f43..0000000 --- a/LICENSE +++ /dev/null @@ -1,202 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. 
For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. 
For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. 
You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. 
Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. 
- - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - diff --git a/README.md b/README.md index c1c6781..915f40c 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,79 @@ -# elasticsearch-phone -An Elasticsearch Phone Number Analyzer Plugin +# Elasticsearch-Phone + +Indexing phone numbers & sip addresses in lucene is complicated. Most people use ngram tokenizers. We did that for a while with ngram min=3 & max=35, but the result was often 100s of tokens per sip address. Working in a call center focused company we quickly figured out how wasteful that is on the storage front. For us 6/7ths of our indexes were wasted on useless sip address tokens. + +It's a hard problem to regex your way out of. An international phone number often includes a country code, but that can be 1, 2, or 3+ digits. A lot of people have requested elasticsearch integrate Google's libphonenumber library into a custom lucene analyzer. It hasn't happened yet, so here's a plugin that attempts to do just that. + +Note: This is a young project we're just starting to test as of 8/3/2015. We'll improve as time goes on, but use at your own risk. + +# Building and installing the plugin +mvn package +./bin/plugin --url file:///....elasticsearch-phone/target/releases/elasticsearch-phone-1.0.0.zip --install elasticsearch-phone; + +## Example inputs + +Provide a telephone or sip address prefixed by "tel:" or "sip:" with no spaces or symbols. 
+ +Your indexing template will need to specify the analyzer for the field. EG + "dnis": { + "type": "string", + "analyzer": "phone" + }, + + +Sample allowed inputs (see PhoneIntegrationTest for more) +tel:+441344840400 +tel:+498362930830 +sip:abc@autosbcpc +sip:+13119310462;ext=2244@178.12.10.115:8060 + +## Example tokenization + +INPUT (with country code derived with Google's libphonenumber) + +sip:+13169410766;ext=2233@172.17.10.117:8060 + +TOKENS + ++13169410766;ext=2233 +1 +2233 +3169410766 +3 +13 +31 +131 +316 +1316 +3169 +13169 +31694 +131694 +316941 +1316941 +3169410 +13169410 +31694107 +131694107 +316941076 +1316941076 +3169410766 +13169410766 + +INPUT (without a country code) + +tel:8177148350 + +TOKENS + +8177148350 +8 +81 +817 +8177 +81771 +817714 +8177148 +81771483 +817714835 +8177148350 \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..44f155e --- /dev/null +++ b/pom.xml @@ -0,0 +1,176 @@ + + + 4.0.0 + + com.inin.analytics + elasticsearch-phone + jar + 1.0.0 + elasticsearch-phone + Elasticsearch Plugin for Phone and SIP Analysis + https://github.com/MyPureCloud/elasticsearch-phone + + UTF-8 + 1.7 + 4.11 + /tmp + + + + + The Apache License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + + + + + https://github.com/MyPureCloud/elasticsearch-phone.git + https://github.com/MyPureCloud/elasticsearch-phone.git + https://github.com/MyPureCloud/elasticsearch-phone.git + + + + + + oss-sonatype + oss-sonatype + https://oss.sonatype.org/content/groups/public + + + + + + com.googlecode.libphonenumber + libphonenumber + 7.0.7 + + + + org.apache.lucene + lucene-test-framework + 4.10.4 + test + + + + org.elasticsearch + elasticsearch + 1.6.0 + + + + com.carrotsearch.randomizedtesting + randomizedtesting-runner + 2.1.11 + test + + + org.elasticsearch + elasticsearch + 1.6.0 + test + test-jar + + + + org.hamcrest + hamcrest-all + 1.3 + test + + + + org.apache.commons + commons-lang3 + 3.4 + + + + commons-io + 
commons-io + 2.4 + + + + + junit + junit + ${junit.version} + test + + + + + + Drew Dahlke + justin.dahlke@gmail.com + Interactive Intelligence + http://www.inin.com + + + Michael Mulligan + Michael.Mulligan@inin.com + Interactive Intelligence + http://www.inin.com + + + + + + ossrh + https://oss.sonatype.org/content/repositories/snapshots + + + ossrh + https://oss.sonatype.org/service/local/staging/deploy/maven2/ + + + + + + + src/main/resources + true + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + com.carrotsearch.randomizedtesting + junit4-maven-plugin + + + org.apache.maven.plugins + maven-surefire-plugin + + + + org.apache.maven.plugins + maven-source-plugin + + + maven-assembly-plugin + 2.3 + + false + ${project.build.directory}/releases/ + + ${basedir}/src/main/assemblies/plugin.xml + + + + + package + + single + + + + + + + diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml new file mode 100644 index 0000000..7d02d86 --- /dev/null +++ b/src/main/assemblies/plugin.xml @@ -0,0 +1,27 @@ + + + plugin + + zip + + false + + + / + true + true + + org.elasticsearch:elasticsearch + + + + / + true + true + + com.googlecode.libphonenumber:libphonenumber + org.apache.commons:commons-lang3 + + + + diff --git a/src/main/java/org/elasticsearch/index/analysis/PhoneAnalyzer.java b/src/main/java/org/elasticsearch/index/analysis/PhoneAnalyzer.java new file mode 100644 index 0000000..88a5ffb --- /dev/null +++ b/src/main/java/org/elasticsearch/index/analysis/PhoneAnalyzer.java @@ -0,0 +1,15 @@ +package org.elasticsearch.index.analysis; + +import java.io.Reader; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Tokenizer; + +public class PhoneAnalyzer extends Analyzer { + + @Override + protected TokenStreamComponents createComponents(String field, Reader reader) { + Tokenizer tokenizer = new PhoneTokenizer(reader); + return new TokenStreamComponents(tokenizer); + } +} \ No newline at end of file diff 
--git a/src/main/java/org/elasticsearch/index/analysis/PhoneTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/PhoneTokenizer.java
new file mode 100644
index 0000000..aa21d45
--- /dev/null
+++ b/src/main/java/org/elasticsearch/index/analysis/PhoneTokenizer.java
@@ -0,0 +1,167 @@
+package org.elasticsearch.index.analysis;
+
+
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang3.math.NumberUtils;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.elasticsearch.common.lang3.StringUtils;
+
+import com.google.i18n.phonenumbers.NumberParseException;
+import com.google.i18n.phonenumbers.PhoneNumberUtil;
+import com.google.i18n.phonenumbers.Phonenumber.PhoneNumber;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Lucene tokenizer for telephone and SIP URIs of the form {@code tel:...} or {@code sip:...}.
+ *
+ * <p>The whole input is read on the first {@link #incrementToken()} call after {@link #reset()}, every
+ * token is computed up front, and the tokens are then handed out one per call.
+ */
+public class PhoneTokenizer extends Tokenizer {
+
+    // Both supported scheme prefixes ("tel:" and "sip:") are this many characters long
+    private static final int SCHEME_PREFIX_LENGTH = 4;
+
+    // The raw input
+    private String stringToTokenize = null;
+
+    // Position in the tokens list. We build all the tokens and return them one at a time as incrementToken gets called.
+    private int position = 0;
+
+    // The tokens are determined on the first iteration and then returned one at a time thereafter.
+    private List<String> tokens = null;
+
+    // The base class grabs the charTermAttribute each time incrementToken returns
+    protected CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);
+
+    public PhoneTokenizer(Reader reader) {
+        super(reader);
+    }
+
+    /**
+     * Emits the next token of the current input, building the token list first if this is the first call since reset().
+     *
+     * @return true if a token was written to the term attribute, false once every token has been handed out
+     * @throws IOException declared by the overridden method; a failed read of the input surfaces as a RuntimeException
+     */
+    @Override
+    public final boolean incrementToken() throws IOException {
+        // Lucene's TokenStream contract asks for all attributes to be cleared before a token is produced
+        clearAttributes();
+
+        if (tokens == null) {
+            // It's the 1st iteration, chop it up into tokens.
+            generateTokens();
+        }
+
+        return returnTokensOneAtATime();
+    }
+
+    // Copies the next pre-computed token into the term attribute; returns false when none are left
+    private boolean returnTokensOneAtATime() {
+        if (tokens == null || position >= tokens.size()) {
+            return false;
+        }
+
+        charTermAttribute.append(tokens.get(position));
+        position += 1;
+        return true;
+    }
+
+    /**
+     * Builds the token list for the current input: the raw string, the part before "@", the
+     * country code / extension / national number when libphonenumber can parse it, and prefix n-grams
+     * of the number (with and without the country code).
+     */
+    private void generateTokens() {
+        String raw = getStringToTokenize();
+
+        tokens = new ArrayList<String>();
+        tokens.add(raw);
+
+        // Without a leading "tel:" or "sip:" we don't know how to treat the input, so the whole string stays 1 token.
+        // startsWith (not indexOf) so a scheme-like substring in the middle of the input is not mistaken for a prefix.
+        if (!raw.startsWith("tel:") && !raw.startsWith("sip:")) {
+            return;
+        }
+        String uri = raw.substring(SCHEME_PREFIX_LENGTH);
+
+        // Drop anything after @. Most likely there's nothing of interest
+        int at = uri.indexOf('@');
+        String number = at >= 0 ? uri.substring(0, at) : uri;
+        if (number.isEmpty()) {
+            // Nothing left to analyse, e.g. "tel:" or "sip:@host"
+            return;
+        }
+
+        // Add a token for the raw unmanipulated address. Note this could be a username (sip) instead of telephone number so take it as is
+        tokens.add(number);
+
+        // Let google's libphone try to parse it
+        String countryCode = null;
+        try {
+            // ZZ is the generic "I don't know the country code" region. Wish we got country code from our edge, but we don't currently.
+            PhoneNumber numberProto = PhoneNumberUtil.getInstance().parse(number, "ZZ");
+
+            // Libphone likes it!
+            countryCode = String.valueOf(numberProto.getCountryCode());
+            number = String.valueOf(numberProto.getNationalNumber());
+
+            // Add Country code, extension, and the number as tokens
+            tokens.add(countryCode);
+            if (!StringUtils.isEmpty(numberProto.getExtension())) {
+                tokens.add(numberProto.getExtension());
+            }
+
+            tokens.add(number);
+        } catch (NumberParseException ignored) {
+            // Libphone didn't like it, no biggie. We'll just ngram the number as it is.
+        }
+
+        // ngram the phone number EG 19198243333 produces 9, 91, 919, etc
+        // isDigits rather than isNumber: isNumber also accepts hex, decimal, exponent and type-suffixed literals.
+        if (NumberUtils.isDigits(number)) {
+            for (int count = 1; count <= number.length(); count++) {
+                String token = number.substring(0, count);
+                tokens.add(token);
+                if (countryCode != null) {
+                    // If there was a country code, add more ngrams such that 19198243333 produces 19, 191, 1919, etc
+                    tokens.add(countryCode + token);
+                }
+            }
+        }
+    }
+
+    /**
+     * Read the input into a local variable
+     *
+     * @return the complete input of this tokenizer, read at most once between resets
+     */
+    private String getStringToTokenize() {
+        if (this.stringToTokenize == null) {
+            try {
+                this.stringToTokenize = IOUtils.toString(input);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+        return this.stringToTokenize;
+    }
+
+    /**
+     * Nuke all state after each use (lucene will re-use an instance of this tokenizer over and over again)
+     */
+    @Override
+    public final void reset() throws IOException {
+        super.reset();
+        this.position = 0;
+        tokens = null;
+        this.stringToTokenize = null;
+        clearAttributes();
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/org/elasticsearch/plugins/analysis/phone/PhoneAnalyzerProvider.java b/src/main/java/org/elasticsearch/plugins/analysis/phone/PhoneAnalyzerProvider.java
new file mode 100644
index 0000000..ff121d1
--- /dev/null
+++ b/src/main/java/org/elasticsearch/plugins/analysis/phone/PhoneAnalyzerProvider.java
@@ -0,0 +1,36 @@
+package org.elasticsearch.plugins.analysis.phone;
+
+
+import java.io.IOException;
+
+import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.assistedinject.Assisted;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
+import org.elasticsearch.index.Index;
+import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider;
+import org.elasticsearch.index.analysis.PhoneAnalyzer;
+import org.elasticsearch.index.settings.IndexSettings;
+
+public class PhoneAnalyzerProvider extends AbstractIndexAnalyzerProvider {
+    protected
PhoneAnalyzer analyzer = new PhoneAnalyzer(); + public static final String NAME = "phone"; + + @Inject + public PhoneAnalyzerProvider(Index index, @IndexSettings Settings indexSettings, Environment env, @Assisted String name, @Assisted Settings settings) throws IOException { + super(index, indexSettings, name, settings); + } + + public PhoneAnalyzer getAnalyzer() { + return analyzer; + } + + public PhoneAnalyzer get() { + return analyzer; + } + + public static String getName() { + return NAME; + } + +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/plugins/analysis/phone/PhoneBinderProcessor.java b/src/main/java/org/elasticsearch/plugins/analysis/phone/PhoneBinderProcessor.java new file mode 100644 index 0000000..57136c1 --- /dev/null +++ b/src/main/java/org/elasticsearch/plugins/analysis/phone/PhoneBinderProcessor.java @@ -0,0 +1,10 @@ +package org.elasticsearch.plugins.analysis.phone; +import org.elasticsearch.index.analysis.AnalysisModule; + +public class PhoneBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { + + @Override + public void processAnalyzers(AnalyzersBindings analyzersBindings) { + analyzersBindings.processAnalyzer(PhoneAnalyzerProvider.NAME, PhoneAnalyzerProvider.class); + } +} \ No newline at end of file diff --git a/src/main/java/org/elasticsearch/plugins/analysis/phone/PhonePlugin.java b/src/main/java/org/elasticsearch/plugins/analysis/phone/PhonePlugin.java new file mode 100644 index 0000000..7a49968 --- /dev/null +++ b/src/main/java/org/elasticsearch/plugins/analysis/phone/PhonePlugin.java @@ -0,0 +1,31 @@ +package org.elasticsearch.plugins.analysis.phone; + +import org.elasticsearch.common.inject.Module; +import org.elasticsearch.index.analysis.AnalysisModule; +import org.elasticsearch.plugins.AbstractPlugin; +import org.elasticsearch.plugins.Plugin; + +public class PhonePlugin extends AbstractPlugin implements Plugin { + + /* Return a description of this plugin. 
*/ + public String description() { + return "Makes a best attempt at tokenizing a phone number or sip address"; + } + + /* This is the function that will register our analyzer with Elasticsearch. */ + public void onModule(AnalysisModule analysisModule) { + analysisModule.addProcessor(new PhoneBinderProcessor()); + } + + @Override + public void processModule(Module module) { + if (module instanceof AnalysisModule) { + AnalysisModule analysisModule = (AnalysisModule) module; + analysisModule.addProcessor(new PhoneBinderProcessor()); + } + } + + public String name() { + return "phone-plugin"; + } +} \ No newline at end of file diff --git a/src/main/resources/es-plugin.properties b/src/main/resources/es-plugin.properties new file mode 100644 index 0000000..b061165 --- /dev/null +++ b/src/main/resources/es-plugin.properties @@ -0,0 +1,3 @@ +plugin=org.elasticsearch.plugins.analysis.phone.PhonePlugin +version=.1 +lucene=4.10.4 diff --git a/src/test/java/tests/PhoneIntegrationTest.java b/src/test/java/tests/PhoneIntegrationTest.java new file mode 100644 index 0000000..c36b94e --- /dev/null +++ b/src/test/java/tests/PhoneIntegrationTest.java @@ -0,0 +1,167 @@ +package tests; + +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.hamcrest.CoreMatchers.is; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Locale; +import java.util.concurrent.ExecutionException; + +import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse; +import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse; +import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse.AnalyzeToken; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.lang3.StringUtils; +import org.elasticsearch.common.settings.ImmutableSettings; +import org.elasticsearch.common.settings.Settings; +import 
org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.plugins.PluginsService; +import org.elasticsearch.test.ElasticsearchIntegrationTest; +import org.junit.Before; +import org.junit.Test; + + +@ElasticsearchIntegrationTest.ClusterScope(scope = ElasticsearchIntegrationTest.Scope.SUITE) +public class PhoneIntegrationTest extends ElasticsearchIntegrationTest { + + static { + ClassLoader.getSystemClassLoader().setDefaultAssertionStatus(true); + } + + @Before + @Override + public void setUp() throws Exception { + super.setUp(); + createIndex("test"); + ensureGreen("test"); + final XContentBuilder mapping = jsonBuilder().startObject() + .startObject("type") + .startObject("properties") + .startObject("foo") + .field("type", "string") + .field("analyzer", "phone") + .endObject() + .endObject() + .endObject() + .endObject(); + + + client().admin().indices().preparePutMapping("test").setType("type").setSource(mapping).get(); + ensureGreen("test"); + Locale.setDefault(new Locale("en_US")); + } + + @Override + protected Settings nodeSettings(int nodeOrdinal) { + org.elasticsearch.common.settings.ImmutableSettings.Builder builder = ImmutableSettings.builder() + .put(super.nodeSettings(nodeOrdinal)) + .put("plugins." 
+ PluginsService.LOAD_PLUGIN_FROM_CLASSPATH, true); + return builder.build(); + } + + @Test + public void testPluginIsLoaded() { + NodesInfoResponse infos = client().admin().cluster().prepareNodesInfo().setPlugins(true).execute().actionGet(); + assertThat(infos.getNodes()[0].getPlugins().getInfos().get(0).getName(), is("phone-plugin")); + } + + @Test + public void testEurope() throws ExecutionException, InterruptedException, IOException { + assertIncludes("tel:+441344840400", Arrays.asList("44", "1344", "1344840400", "441344840400")); + } + + @Test + public void testGermanCastle() throws ExecutionException, InterruptedException, IOException { + assertIncludes("tel:+498362930830", Arrays.asList("49", "498362930830", "8362930830")); + } + + @Test + public void testBMWofSydney() throws ExecutionException, InterruptedException, IOException { + assertIncludes("tel:+61293344555", Arrays.asList("61", "293344555", "61293344555")); + } + + @Test + public void coffeeShopInIreland() throws ExecutionException, InterruptedException, IOException { + assertIncludes("tel:+442890319416", Arrays.asList("44", "289", "2890319416", "442890319416")); + } + + @Test + public void testTelWithCountryCode() throws ExecutionException, InterruptedException, IOException { + assertIncludes("tel:+17177158163", Arrays.asList("1", "717", "7177", "17177158163")); + } + + @Test + public void testTelWithCountryCode2() throws ExecutionException, InterruptedException, IOException { + assertIncludes("tel:+12177148350", Arrays.asList("1", "217", "2177", "2177148350","12177148350")); + } + + @Test + public void testMissingCountryCode() throws ExecutionException, InterruptedException, IOException { + assertIncludes("tel:8177148350", Arrays.asList("817", "8177", "81771", "817714", "8177148350")); + } + + @Test + public void testSipWithNumericUsername() throws ExecutionException, InterruptedException, IOException { + assertIncludes("sip:222@autosbcpc", Arrays.asList("222")); + } + + @Test + public void 
testTruncatedNumber() throws ExecutionException, InterruptedException, IOException { + assertIncludes("tel:5551234", Arrays.asList("5551234")); + } + + @Test + public void testSipWithAlphabeticUsername() throws ExecutionException, InterruptedException, IOException { + assertIncludes("sip:abc@autosbcpc", Arrays.asList("abc")); + } + + @Test + public void testGarbageInGarbageOut() throws ExecutionException, InterruptedException, IOException { + assertIncludes("test", Arrays.asList("test")); + } + + @Test + public void testSipWithCountryCode() throws ExecutionException, InterruptedException, IOException { + assertIncludes("sip:+14177141363@178.97.105.13;isup-oli=0;pstn-params=808481808882", Arrays.asList("417", "4177", "14177")); + } + + @Test + public void testSipWithTelephoneExtension() throws ExecutionException, InterruptedException, IOException { + assertIncludes("sip:+13169410766;ext=2233@178.17.10.117:8060", Arrays.asList("316", "2233", "1316")); + } + + @Test + public void testSipWithUsername() throws ExecutionException, InterruptedException, IOException { + assertIncludes("sip:JeffSIP@178.12.220.18", Arrays.asList("JeffSIP")); + } + + private void assertIncludes(String ani, List expectedTokens) throws ExecutionException, InterruptedException, IOException { + AnalyzeResponse response = client().admin().indices().prepareAnalyze(ani).setField("foo").setIndex("test").execute().get(); + index("test", "type", "1", "foo", ani); + + + // Verify all the expected tokens are in there + List tokens = new ArrayList(); + for(AnalyzeToken token : response.getTokens()) { + assertFalse(StringUtils.isEmpty(token.getTerm())); + tokens.add(token.getTerm()); + //System.out.println(token.getTerm()); + } + + flush(); + refresh(); + + for(String expectedToken : expectedTokens) { + assertTrue(tokens.contains(expectedToken)); + SearchResponse sr = client().prepareSearch("test").setQuery(QueryBuilders.termQuery("foo", expectedToken)).execute().actionGet(); + 
assertThat(sr.getHits().getTotalHits(), is(1L)); + sr = client().prepareSearch("test").setQuery(QueryBuilders.termQuery("foo", "bogussearchterm")).execute().actionGet(); + assertThat(sr.getHits().getTotalHits(), is(0l)); + + } + } +}