From af80705b8c9c65dc9c2ce9303d098044be0bfff3 Mon Sep 17 00:00:00 2001 From: Gentaro Watanabe Date: Fri, 19 Sep 2014 15:20:42 +0900 Subject: [PATCH] import publication-ready version --- .gitignore | 26 + LICENSE | 458 +++++++++++++ README.md | 77 +++ Versioning.md | 26 + data/shogun_data.json | 44 ++ gateway/assembly.sbt | 20 + gateway/build.sbt | 29 + gateway/project/assembly.sbt | 1 + gateway/project/plugins.sbt | 1 + gateway/src/main/resources/log4j.xml | 25 + .../jubaql_server/gateway/GatewayPlan.scala | 370 +++++++++++ .../jubaql_server/gateway/JubaQLGateway.scala | 119 ++++ .../jubaql_server/gateway/json/Query.scala | 18 + .../gateway/json/QueryToProcessor.scala | 18 + .../jubaql_server/gateway/json/Register.scala | 18 + .../gateway/json/SessionId.scala | 18 + .../gateway/json/Unregister.scala | 18 + .../src/test/resources/processor-logfile.jar | Bin 0 -> 728 bytes .../jubaql_server/gateway/GatewayServer.scala | 35 + .../jubaql_server/gateway/JubaQLSpec.scala | 79 +++ .../jubaql_server/gateway/LoginSpec.scala | 56 ++ .../gateway/ProcessorAndGatewayServer.scala | 79 +++ .../jubaql_server/gateway/RegisterSpec.scala | 120 ++++ .../gateway/UnregisterSpec.scala | 113 ++++ processor/assembly.sbt | 61 ++ processor/build.sbt | 122 ++++ processor/project/assembly.sbt | 1 + processor/project/deptree.sbt | 1 + processor/project/plugins.sbt | 1 + .../src/main/resources/core-site.xml.dist | 128 ++++ .../src/main/resources/hdfs-site.xml.dist | 68 ++ processor/src/main/resources/log4j.xml | 59 ++ .../src/main/resources/yarn-site.xml.dist | 136 ++++ .../dstream/OrderedFileInputDStream.scala | 272 ++++++++ .../processor/HandleExceptions.scala | 41 ++ .../processor/HybridProcessor.scala | 453 +++++++++++++ .../jubaql_server/processor/JubaQLAST.scala | 47 ++ .../processor/JubaQLParser.scala | 176 +++++ .../processor/JubaQLProcessor.scala | 186 ++++++ .../processor/JubaQLService.scala | 626 ++++++++++++++++++ .../processor/MaxOptionAccumulatorParam.scala | 41 ++ .../processor/RegistrationHandler.scala | 58 ++ .../processor/StringWrapper.scala | 18 + .../processor/json/AnomalyScore.scala | 18 + .../processor/json/ClassifierPrediction.scala | 18 + .../processor/json/ClassifierResult.scala | 18 + .../processor/json/DatumResult.scala | 18 + .../processor/json/Register.scala | 18 + .../processor/json/Unregister.scala | 18 + .../processor/logical/RegisterAsTable.scala | 28 + .../processor/updater/Anomaly.scala | 40 ++ .../processor/updater/Classifier.scala | 52 ++ .../processor/updater/HttpClientPerJvm.scala | 56 ++ .../processor/updater/Recommender.scala | 47 ++ .../processor/updater/Updater.scala | 54 ++ .../src/test/resources/core-site.xml.dist | 128 ++++ processor/src/test/resources/dummydata/1.json | 2 + processor/src/test/resources/dummydata/2.json | 2 + .../src/test/resources/hdfs-site.xml.dist | 68 ++ processor/src/test/resources/kafka.xml.dist | 6 + processor/src/test/resources/lof.json | 32 + processor/src/test/resources/log4j.xml | 59 ++ .../src/test/resources/npb_similar_player.csv | 144 ++++ .../test/resources/npb_similar_player.json | 16 + .../resources/npb_similar_player_data.json | 144 ++++ processor/src/test/resources/shogun.json | 20 + processor/src/test/resources/shogun_data.json | 44 ++ .../src/test/resources/yarn-site.xml.dist | 136 ++++ .../processor/HasKafkaPath.scala | 40 ++ .../processor/HybridProcessorSpec.scala | 288 ++++++++ .../processor/JubaQLParserSpec.scala | 128 ++++ .../processor/JubaQLProcessorSpec.scala | 564 ++++++++++++++++ .../processor/JubaQLServiceHelperSpec.scala | 92 
+++ .../LocalJubatusApplicationSpec.scala | 159 +++++ .../jubaql_server/processor/MockServer.scala | 36 + .../processor/RegistrationSpec.scala | 189 ++++++ .../jubaql_server/processor/TestTags.scala | 40 ++ processor/start-script/.keep | 0 78 files changed, 6985 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 Versioning.md create mode 100644 data/shogun_data.json create mode 100644 gateway/assembly.sbt create mode 100644 gateway/build.sbt create mode 100644 gateway/project/assembly.sbt create mode 100644 gateway/project/plugins.sbt create mode 100644 gateway/src/main/resources/log4j.xml create mode 100644 gateway/src/main/scala/us/jubat/jubaql_server/gateway/GatewayPlan.scala create mode 100644 gateway/src/main/scala/us/jubat/jubaql_server/gateway/JubaQLGateway.scala create mode 100644 gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/Query.scala create mode 100644 gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/QueryToProcessor.scala create mode 100644 gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/Register.scala create mode 100644 gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/SessionId.scala create mode 100644 gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/Unregister.scala create mode 100644 gateway/src/test/resources/processor-logfile.jar create mode 100644 gateway/src/test/scala/us/jubat/jubaql_server/gateway/GatewayServer.scala create mode 100644 gateway/src/test/scala/us/jubat/jubaql_server/gateway/JubaQLSpec.scala create mode 100644 gateway/src/test/scala/us/jubat/jubaql_server/gateway/LoginSpec.scala create mode 100644 gateway/src/test/scala/us/jubat/jubaql_server/gateway/ProcessorAndGatewayServer.scala create mode 100644 gateway/src/test/scala/us/jubat/jubaql_server/gateway/RegisterSpec.scala create mode 100644 gateway/src/test/scala/us/jubat/jubaql_server/gateway/UnregisterSpec.scala create mode 100644 processor/assembly.sbt create mode 100644 processor/build.sbt create mode 100644 processor/project/assembly.sbt create mode 100644 processor/project/deptree.sbt create mode 100644 processor/project/plugins.sbt create mode 100644 processor/src/main/resources/core-site.xml.dist create mode 100644 processor/src/main/resources/hdfs-site.xml.dist create mode 100644 processor/src/main/resources/log4j.xml create mode 100644 processor/src/main/resources/yarn-site.xml.dist create mode 100644 processor/src/main/scala/org/apache/spark/streaming/dstream/OrderedFileInputDStream.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/HandleExceptions.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/HybridProcessor.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLAST.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLParser.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLProcessor.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLService.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/MaxOptionAccumulatorParam.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/RegistrationHandler.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/StringWrapper.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/json/AnomalyScore.scala create 
mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/json/ClassifierPrediction.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/json/ClassifierResult.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/json/DatumResult.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/json/Register.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/json/Unregister.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/logical/RegisterAsTable.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Anomaly.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Classifier.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/updater/HttpClientPerJvm.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Recommender.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Updater.scala create mode 100644 processor/src/test/resources/core-site.xml.dist create mode 100644 processor/src/test/resources/dummydata/1.json create mode 100644 processor/src/test/resources/dummydata/2.json create mode 100644 processor/src/test/resources/hdfs-site.xml.dist create mode 100644 processor/src/test/resources/kafka.xml.dist create mode 100644 processor/src/test/resources/lof.json create mode 100644 processor/src/test/resources/log4j.xml create mode 100644 processor/src/test/resources/npb_similar_player.csv create mode 100644 processor/src/test/resources/npb_similar_player.json create mode 100644 processor/src/test/resources/npb_similar_player_data.json create mode 100644 processor/src/test/resources/shogun.json create mode 100644 processor/src/test/resources/shogun_data.json create mode 100644 processor/src/test/resources/yarn-site.xml.dist create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/HasKafkaPath.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/HybridProcessorSpec.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLParserSpec.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLProcessorSpec.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLServiceHelperSpec.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/LocalJubatusApplicationSpec.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/MockServer.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/RegistrationSpec.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/TestTags.scala create mode 100644 processor/start-script/.keep diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..be6a063 --- /dev/null +++ b/.gitignore @@ -0,0 +1,26 @@ +*.class +*.log + +# sbt specific +dist/* +target/ +lib_managed/ +src_managed/ +project/boot/ +project/plugins/project/ + +# Scala-IDE specific +.scala_dependencies +.idea + +# emacs specific +*~ + +# auto-generated files +processor/start-script/run + +# config files +core-site.xml +hdfs-site.xml +yarn-site.xml +kafka.xml diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..e9ab0b3 --- /dev/null +++ b/LICENSE @@ -0,0 +1,458 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, 
February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. 
This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it becomes +a de-facto standard. To achieve this, non-free programs must be +allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. + + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. 
A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control compilation +and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. (It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at + least three years, to give the same user the materials + specified in Subsection 6a, above, for a charge no more + than the cost of performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. 
However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. 
For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under any +particular circumstance, the balance of the section is intended to apply, +and the section as a whole is intended to apply in other circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License may add +an explicit geographical distribution limitation excluding those countries, +so that distribution is permitted only in or among countries not thus +excluded. In such case, this License incorporates the limitation as if +written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. 
SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+                  END OF TERMS AND CONDITIONS
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..44f1780
--- /dev/null
+++ b/README.md
@@ -0,0 +1,77 @@
+JubaQL
+======
+
+How to get started with JubaQL
+------------------------------
+
+### Development Setup
+
+* Get a Hadoop-enabled version of Spark 1.1.1:
+  `wget http://d3kbcqa49mib13.cloudfront.net/spark-1.1.1-bin-hadoop2.4.tgz`
+  and unpack it somewhere:
+  `tar -xzf spark-1.1.1-bin-hadoop2.4.tgz && export SPARK_DIST="$(pwd)/spark-1.1.1-bin-hadoop2.4/"`
+* Install Jubatus.
+* Get JubaQLClient and JubaQLServer (which consists of JubaQLProcessor and JubaQLGateway):
+  `git clone https://github.com/jubatus/jubaql-client.git`
+  `git clone https://github.com/jubatus/jubaql-server.git`
+* Build the JubaQL components:
+  * JubaQLClient:
+    `cd jubaql-client && sbt start-script && cd ..`
+  * JubaQLProcessor:
+    `cd jubaql-server/processor && sbt assembly && cd ../..`
+  * JubaQLGateway:
+    `cd jubaql-server/gateway && sbt assembly && cd ../..`
+* Start the JubaQLGateway:
+  `cd jubaql-server && java -Dspark.distribution="$SPARK_DIST" -Djubaql.processor.fatjar=processor/target/scala-2.10/jubaql-processor-assembly-1.2.0.jar -jar gateway/target/scala-2.10/jubaql-gateway-assembly-1.2.0.jar -i 127.0.0.1`
+* In a different shell, start the JubaQLClient:
+  `./jubaql-client/target/start`
+* You will see the `jubaql>` prompt in the shell and can already type commands there, but until the JubaQLProcessor is up and running correctly, you will get the message "Unexpected response status: 503".
+
+In order to test that your setup is working correctly, you can do a simple classification using the data from the [shogun example](https://github.com/jubatus/jubatus-example/tree/master/shogun). Run the following JubaQL commands in the client:
+
+* `CREATE CLASSIFIER MODEL test WITH (label: "label", datum: "name") config = '{"method": "AROW","converter": { "num_filter_types": {}, "num_filter_rules": [], "string_filter_types": {}, "string_filter_rules": [], "num_types": {}, "num_rules": [],"string_types": {"unigram": { "method": "ngram", "char_num": "1" }},"string_rules": [{ "key": "*", "type": "unigram", "sample_weight": "bin", "global_weight": "bin" } ]},"parameter": {"regularization_weight" : 1.0}}'`
+* `CREATE DATASOURCE shogun (label string, name string) FROM (STORAGE: "file://data/shogun_data.json")`
+* `UPDATE MODEL test USING train FROM shogun`
+* `ANALYZE '{"name": "慶喜"}' BY MODEL test USING classify`
+* `SHUTDOWN`
+
+The JSON returned by the `ANALYZE` statement should indicate that the label "徳川" has the highest score.
+
+### Run on YARN with local gateway
+
+* Set up a Hadoop cluster with YARN and HDFS in place.
+* Install Jubatus on all cluster nodes.
+* Get JubaQL and compile it as described above.
+  (This time, Jubatus is not required locally.)
+* Install the [Jubatus on YARN](https://github.com/jubatus/jubatus-on-yarn) libraries in HDFS as described in [the instructions](https://github.com/jubatus/jubatus-on-yarn/blob/master/document/%E3%83%93%E3%83%AB%E3%83%89%E3%83%BB%E5%88%A9%E7%94%A8%E6%89%8B%E9%A0%86%E6%9B%B8.md#%E5%AE%9F%E8%A1%8C%E3%81%AB%E5%BF%85%E8%A6%81%E3%81%AA%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB%E3%81%AE%E6%BA%96%E5%82%99). Make sure that the HDFS directory `/jubatus-on-yarn/application-master/jubaconfig/` exists and is writeable by the user running the JubaQLProcessor application.
+* To test the setup, also copy the file `shogun_data.json` from the JubaQL source tree's `data/` directory to `/jubatus-on-yarn/sample/shogun_data.json` in HDFS.
+* Copy the files `core-site.xml`, `yarn-site.xml`, `hdfs-site.xml` containing your Hadoop setup description from one of your cluster nodes to some directory and point the environment variable `HADOOP_CONF_DIR` to that directory.
+* Get your local computer's IP address that points towards the cluster. On Linux, given the IP address of one of your cluster nodes, this should be possible with something like:
+  `export MY_IP=$(ip route get 12.34.56.78 | grep -Po 'src \K.+')`
+  Make sure that the cluster nodes can connect to this IP address and that no firewall rules etc. are blocking access.
+* Get the addresses of your Zookeeper nodes and concatenate their `host:port` locations with a comma:
+  `export MY_ZOOKEEPER=zk1:2181,zk2:2181`
+* Start the JubaQLGateway:
+  `cd jubaql-server`
+  `java -Drun.mode=production -Djubaql.zookeeper=$MY_ZOOKEEPER -Dspark.distribution="$SPARK_DIST" -Djubaql.processor.fatjar=processor/target/scala-2.10/jubaql-processor-assembly-1.2.0.jar -jar gateway/target/scala-2.10/jubaql-gateway-assembly-1.2.0.jar -i $MY_IP`
+* In a different shell, start the JubaQLClient:
+  `./jubaql-client/target/start`
+* You will see the `jubaql>` prompt in the shell and can already type commands there, but until the JubaQLProcessor is up and running correctly, you will get the message "Unexpected response status: 503".
+
+In order to test that your setup is working correctly, you can do a simple classification using the `shogun_data.json` file you copied to HDFS before. Run the following JubaQL commands in the client:
+
+* `CREATE CLASSIFIER MODEL test WITH (label: "label", datum: "name") config = '{"method": "AROW","converter": { "num_filter_types": {}, "num_filter_rules": [], "string_filter_types": {}, "string_filter_rules": [], "num_types": {}, "num_rules": [],"string_types": {"unigram": { "method": "ngram", "char_num": "1" }},"string_rules": [{ "key": "*", "type": "unigram", "sample_weight": "bin", "global_weight": "bin" } ]},"parameter": {"regularization_weight" : 1.0}}'`
+* `CREATE DATASOURCE shogun (label string, name string) FROM (STORAGE: "hdfs:///jubatus-on-yarn/sample/shogun_data.json")`
+* `UPDATE MODEL test USING train FROM shogun`
+* `ANALYZE '{"name": "慶喜"}' BY MODEL test USING classify`
+* `SHUTDOWN`
+
+The JSON returned by the `ANALYZE` statement should indicate that the label "徳川" has the highest score.
+
+Note:
+* When the JubaQLProcessor is started using `spark-submit` as outlined above, it will first upload the `spark-assembly-1.1.1-hadoop2.4.0.jar` and `jubaql-processor-assembly-1.2.0.jar` files to HDFS, from where they will be downloaded by each executor.
+* It is possible to skip the upload of the Spark libraries by copying the Spark jar file to HDFS manually and adding the parameter `-Dspark.yarn.jar=hdfs:///path/to/spark-assembly-1.1.1-hadoop2.4.0.jar` when starting the JubaQLGateway.
+* In theory, it is also possible to do the same for the JubaQLProcessor application jar file. However, at the moment we rely on extracting a `log4j.xml` file from that jar locally before upload, so storing that file in HDFS is not yet supported.
+
+### Run on YARN with remote gateway
+
+In general, this setup is very similar to the setup in the previous section. The only difference is that the gateway runs on a remote host. Therefore, the jar files for JubaQLProcessor and JubaQLGateway as well as the Hadoop configuration files must be copied there and the JubaQLGateway started there. Also, pass the `-h hostname` parameter to the JubaQLClient to connect to the remote server.
diff --git a/Versioning.md b/Versioning.md
new file mode 100644
index 0000000..2c43d06
--- /dev/null
+++ b/Versioning.md
@@ -0,0 +1,26 @@
+The following workflow realizes the desired versioning scheme in git:
+
+* Version on the develop branch is
+  `<version>+dev`.
+* When releasing a new version from the develop branch:
+  * Change version numbers in files to
+    `<version>`
+    and commit.
+  * Create the branch
+    `releases/v<version>`
+    from develop.
+  * On develop, change version numbers in files to
+    `<next version>+dev`
+    and commit.
+  * When the release is complete, merge the
+    `releases/v<version>`
+    branch into master and tag it.
+
+This way,
+
+* it is easy to know which release version the current dev code is
+  based on,
+* a jar file created on the develop-based branch is easily detectable
+  as a development version,
+* while the commits on the release branch can still be merged back to
+  the develop branch.
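As a cross-check of the client/gateway exchange described in the README above, here is a minimal sketch of the same HTTP conversation driven directly from Scala, using the `dispatch` and `json4s` libraries this patch itself depends on. The gateway location `127.0.0.1:9877` assumes the gateway's default port, and `SHUTDOWN` is just an example query; the `/login` and `/jubaql` endpoints are the ones implemented in `GatewayPlan.scala` below.

```scala
import dispatch._
import dispatch.Defaults._
import org.json4s._
import org.json4s.native.JsonMethods.parse
import org.json4s.native.Serialization.write

object GatewayClientSketch extends App {
  implicit val formats = DefaultFormats
  val gateway = host("127.0.0.1", 9877) // assumed gateway address and default port

  // POST /login returns a JSON body of the form {"session_id": "..."}
  val loginBody = Http((gateway / "login").POST OK as.String).apply()
  val sessionId = (parse(loginBody) \ "session_id").extract[String]

  // POST /jubaql forwards the query to the processor; until the processor
  // has registered itself with the gateway, this answers with status 503.
  val queryJson = write(Map("session_id" -> sessionId, "query" -> "SHUTDOWN"))
  val (status, body) = Http((gateway / "jubaql").POST << queryJson > { r =>
    (r.getStatusCode, r.getResponseBody)
  }).apply()
  println(s"$status: $body")
}
```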
diff --git a/data/shogun_data.json b/data/shogun_data.json
new file mode 100644
index 0000000..8096d19
--- /dev/null
+++ b/data/shogun_data.json
@@ -0,0 +1,44 @@
+{"label":"徳川","name":"家康"}
+{"label":"徳川","name":"秀忠"}
+{"label":"徳川","name":"家光"}
+{"label":"徳川","name":"家綱"}
+{"label":"徳川","name":"綱吉"}
+{"label":"徳川","name":"家宣"}
+{"label":"徳川","name":"家継"}
+{"label":"徳川","name":"吉宗"}
+{"label":"徳川","name":"家重"}
+{"label":"徳川","name":"家治"}
+{"label":"徳川","name":"家斉"}
+{"label":"徳川","name":"家慶"}
+{"label":"徳川","name":"家定"}
+{"label":"徳川","name":"家茂"}
+{"label":"足利","name":"尊氏"}
+{"label":"足利","name":"義詮"}
+{"label":"足利","name":"義満"}
+{"label":"足利","name":"義持"}
+{"label":"足利","name":"義量"}
+{"label":"足利","name":"義教"}
+{"label":"足利","name":"義勝"}
+{"label":"足利","name":"義政"}
+{"label":"足利","name":"義尚"}
+{"label":"足利","name":"義稙"}
+{"label":"足利","name":"義澄"}
+{"label":"足利","name":"義稙"}
+{"label":"足利","name":"義晴"}
+{"label":"足利","name":"義輝"}
+{"label":"足利","name":"義栄"}
+{"label":"北条","name":"時政"}
+{"label":"北条","name":"義時"}
+{"label":"北条","name":"泰時"}
+{"label":"北条","name":"経時"}
+{"label":"北条","name":"時頼"}
+{"label":"北条","name":"長時"}
+{"label":"北条","name":"政村"}
+{"label":"北条","name":"時宗"}
+{"label":"北条","name":"貞時"}
+{"label":"北条","name":"師時"}
+{"label":"北条","name":"宗宣"}
+{"label":"北条","name":"煕時"}
+{"label":"北条","name":"基時"}
+{"label":"北条","name":"高時"}
+{"label":"北条","name":"貞顕"}
diff --git a/gateway/assembly.sbt b/gateway/assembly.sbt
new file mode 100644
index 0000000..c8189c7
--- /dev/null
+++ b/gateway/assembly.sbt
@@ -0,0 +1,20 @@
+import AssemblyKeys._
+
+assemblySettings
+
+test in assembly := {}
+
+jarName in assembly := "jubaql-gateway-assembly-" + version.value + ".jar"
+
+/// We MUST include Scala libraries, otherwise scalalogging won't
+/// be included:
+// assemblyOption in assembly ~= {
+//   _.copy(includeScala = false)
+// }
+
+mergeStrategy in assembly <<= (mergeStrategy in assembly) {
+  (old) => {
+    case x if x.startsWith("META-INF/io.netty.versions.properties") => MergeStrategy.last
+    case x => old(x)
+  }
+}
diff --git a/gateway/build.sbt b/gateway/build.sbt
new file mode 100644
index 0000000..bb46465
--- /dev/null
+++ b/gateway/build.sbt
@@ -0,0 +1,29 @@
+name := "JubaQL Gateway"
+
+version := "1.2.0"
+
+// use an older version than necessary to use the same set of dependencies
+// across projects
+scalaVersion := "2.10.4"
+
+libraryDependencies ++= Seq(
+  // logging
+  "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2",
+  "org.slf4j" % "slf4j-api" % "1.7.7",
+  "org.slf4j" % "slf4j-log4j12" % "1.7.7",
+  // HTTP server interface
+  "net.databinder" %% "unfiltered-filter" % "0.8.2",
+  "net.databinder" %% "unfiltered-netty-server" % "0.8.2",
+  "net.databinder" %% "unfiltered-json4s" % "0.8.2",
+  "org.json4s" %% "json4s-ext" % "3.2.10",
+  // making HTTP requests
+  "net.databinder.dispatch" %% "dispatch-core" % "0.11.2",
+  // parsing of program arguments
+  "com.github.scopt" %% "scopt" % "3.2.0",
+  // testing
+  "org.scalatest" %% "scalatest" % "2.2.1"
+)
+
+// disable parallel test execution to avoid BindException when mocking
+// HTTP servers
+parallelExecution in Test := false
diff --git a/gateway/project/assembly.sbt b/gateway/project/assembly.sbt
new file mode 100644
index 0000000..54c3252
--- /dev/null
+++ b/gateway/project/assembly.sbt
@@ -0,0 +1 @@
+addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2")
diff --git a/gateway/project/plugins.sbt b/gateway/project/plugins.sbt
new file mode 100644
index 0000000..14a6ca1
--- /dev/null
+++ b/gateway/project/plugins.sbt
@@ -0,0 +1 @@
+logLevel := Level.Warn
\ No newline at end of file
diff --git a/gateway/src/main/resources/log4j.xml b/gateway/src/main/resources/log4j.xml
new file mode 100644
index 0000000..8e69dd3
--- /dev/null
+++ b/gateway/src/main/resources/log4j.xml
@@ -0,0 +1,25 @@
+[25 lines of log4j appender/logger configuration; the XML markup was stripped during extraction]
diff --git a/gateway/src/main/scala/us/jubat/jubaql_server/gateway/GatewayPlan.scala b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/GatewayPlan.scala
new file mode 100644
index 0000000..3f41df9
--- /dev/null
+++ b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/GatewayPlan.scala
@@ -0,0 +1,370 @@
+// Jubatus: Online machine learning framework for distributed environment
+// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation.
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License version 2.1 as published by the Free Software Foundation.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+package us.jubat.jubaql_server.gateway
+
+import com.typesafe.scalalogging.slf4j.LazyLogging
+import org.jboss.netty.handler.execution.MemoryAwareThreadPoolExecutor
+import unfiltered.response._
+import unfiltered.request._
+import unfiltered.netty.{cycle, ServerErrorResponse}
+import us.jubat.jubaql_server.gateway.json.{Unregister, Register, Query, SessionId, QueryToProcessor}
+import scala.collection.mutable
+import org.json4s.DefaultFormats
+import org.json4s.native.Serialization.write
+import scala.util.Random
+import scala.util.{Try, Success, Failure}
+import java.io._
+import dispatch._
+import dispatch.Defaults._
+import java.util.jar.JarFile
+import java.nio.file.{StandardCopyOption, Files}
+
+// A Netty plan using async IO
+// cf. .
+// If it is not sharable, then it can handle just one request; otherwise
+// this object will be reused for many requests.
+@io.netty.channel.ChannelHandler.Sharable
+class GatewayPlan(ipAddress: String, port: Int,
+                  envpForProcessor: Array[String], runMode: RunMode,
+                  sparkDistribution: String, fatjar: String)
+  extends cycle.Plan
+  /* With cycle.SynchronousExecution, there is a group of N (16?) threads
+     (named "nioEventLoopGroup-5-*") that will process N requests in
+     parallel, the rest will be queued for later execution. This can
+     lead to timeouts, but will prevent too many threads running in parallel.
+     The documentation says: "Evaluates the intent and its response function
+     on an I/O worker thread. This is only appropriate if the intent is fully
+     CPU-bound. If any thread-blocking I/O is required, use deferred
+     execution."
+     With cycle.ThreadPool, there is an unbounded thread pool. 500
+     concurrent clients will probably kill the server.
+     With cycle.DeferralExecutor, a ThreadPoolExecutor can be chosen that
+     can limit the number of threads, e.g., by memory. (That is what
+     MemoryAwareThreadPoolExecutor does. This does not exactly work as
+     expected, though: The thread pool always keeps the same size; at least
+     when tested with Thread.sleep(5000) as "blocking code".)
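+     For reference, the MemoryAwareThreadPoolExecutor below is constructed
+     with corePoolSize = 16, maxChannelMemorySize = 65536 (64 KiB) and
+     maxTotalMemorySize = 1048576 (1 MiB): rather than growing the pool,
+     Netty queues or throttles further work once those memory bounds are hit.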
+   */
+  with cycle.DeferralExecutor with cycle.DeferredIntent
+  // for error handling
+  with ServerErrorResponse
+  with LazyLogging {
+  lazy val underlying = new MemoryAwareThreadPoolExecutor(16, 65536, 1048576)
+
+  // holds session ids mapping to keys and host:port locations, respectively
+  val session2key: mutable.Map[String, String] = new mutable.HashMap()
+  val key2session: mutable.Map[String, String] = new mutable.HashMap()
+  val session2loc: mutable.Map[String, (String, Int)] = new mutable.HashMap()
+
+  /* When starting the processor using spark-submit, we rely on a certain
+   * logging behavior. It seems like the log4j.xml file bundled with
+   * the application jar is *not* used when using spark-submit, at least
+   * not before the file bundled with Spark. To get around this, we create
+   * a local copy of that log4j file and pass it as a parameter to
+   * spark-submit.
+   */
+  val tmpLog4jPath: String = try {
+    val jar = new JarFile(new File(fatjar))
+    val log4jFile = jar.getEntry("log4j.xml")
+    val log4jIs = jar.getInputStream(log4jFile)
+    val tmpFile = File.createTempFile("log4j", ".xml")
+    Files.copy(log4jIs, tmpFile.toPath, StandardCopyOption.REPLACE_EXISTING)
+    tmpFile.deleteOnExit()
+    tmpFile.getAbsolutePath
+  } catch {
+    case e: Throwable =>
+      logger.error("failed to create temporary log4j.xml copy: " + e.getMessage)
+      throw e
+  }
+  logger.debug("extracted log4j.xml file to %s".format(tmpLog4jPath))
+
+  val errorMsgContentType = ContentType("text/plain; charset=utf-8")
+
+  implicit val formats = DefaultFormats
+
+  def intent = {
+    case req@POST(Path("/login")) =>
+      var sessionId = ""
+      var key = ""
+      val reqSource = req.remoteAddr
+      logger.info(f"received HTTP request at /login from $reqSource%s")
+      session2key.synchronized {
+        do {
+          sessionId = Alphanumeric.generate(20) // TODO: generate in a more sophisticated way.
+        } while (session2key.get(sessionId) != None)
+        do {
+          key = Alphanumeric.generate(20) // TODO: generate in a more sophisticated way.
+        } while (key2session.get(key) != None)
+        session2key += (sessionId -> key)
+        key2session += (key -> sessionId)
+      }
+      val callbackUrl = composeCallbackUrl(ipAddress, port, key)
+
+      val runtime = Runtime.getRuntime
+      val cmd = mutable.ArrayBuffer(f"$sparkDistribution%s/bin/spark-submit",
+        "--class", "us.jubat.jubaql_server.processor.JubaQLProcessor",
+        "--master", "", // set later
+        "--conf", "", // set later
+        "--conf", s"log4j.configuration=file:$tmpLog4jPath",
+        fatjar,
+        callbackUrl)
+      logger.info(f"starting Spark in run mode $runMode%s (session_id: $sessionId%s)")
+      val divide = runMode match {
+        case RunMode.Production(zookeeper, numExecutors, coresPerExecutor, sparkJar) =>
+          cmd.update(4, "yarn-cluster") // --master
+          // When we run the processor on YARN, any options passed in with run.mode
+          // will be passed to the SparkSubmit class, not the Spark driver. To
+          // get the run.mode passed one step further, we use the extraJavaOptions
+          // variable. It is important to NOT ADD ANY QUOTES HERE or they will be
+          // double-escaped on their way to the Spark driver and probably never end
+          // up there.
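+          // For illustration: after the updates and inserts below, the exec'd
+          // argument vector looks roughly like this (values filled in at
+          // runtime; exec takes an array, so no shell quoting is involved):
+          //   .../bin/spark-submit
+          //   --class us.jubat.jubaql_server.processor.JubaQLProcessor
+          //   --master yarn-cluster
+          //   --conf spark.driver.extraJavaOptions=-Drun.mode=production -Djubaql.zookeeper=<zk>
+          //   --conf log4j.configuration=file:<tmpLog4jPath>
+          //   --num-executors <N> --executor-cores <C> [--conf spark.yarn.jar=<url>]
+          //   <fatjar> <callbackUrl>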
+          cmd.update(6, "spark.driver.extraJavaOptions=-Drun.mode=production " +
+            s"-Djubaql.zookeeper=$zookeeper") // --conf
+          // also specify the location of the Spark jar file, if given
+          val sparkJarParams = sparkJar match {
+            case Some(url) => "--conf" :: s"spark.yarn.jar=$url" :: Nil
+            case _ => Nil
+          }
+          cmd.insertAll(9, "--num-executors" :: numExecutors.toString ::
+            "--executor-cores" :: coresPerExecutor.toString :: sparkJarParams)
+          logger.debug("executing: " + cmd.mkString(" "))
+
+          Try {
+            val maybeProcess = Try(runtime.exec(cmd.toArray, envpForProcessor))
+
+            maybeProcess.flatMap { process =>
+              // NB. which stream we have to use and whether the message we are
+              // waiting for actually appears, depends on the log4j.xml file
+              // bundled in the application jar...
+              val is: InputStream = process.getInputStream
+              val isr = new InputStreamReader(is)
+              val br = new BufferedReader(isr)
+              var line: String = br.readLine()
+              while (line != null && line.trim != "yarnAppState: RUNNING") {
+                if (line.contains("Exception")) {
+                  logger.error(line)
+                  throw new RuntimeException("could not start spark-submit")
+                }
+                line = br.readLine()
+              }
+              process.destroy()
+              // TODO: consider checking that line is not null here
+              Success(1)
+            }
+          }
+        case RunMode.Development(numThreads) =>
+          cmd.update(4, s"local[$numThreads]") // --master
+          cmd.update(6, "run.mode=development") // --conf
+          logger.debug("executing: " + cmd.mkString(" "))
+
+          Try {
+            val maybeProcess = Try(runtime.exec(cmd.toArray))
+
+            maybeProcess.flatMap { process =>
+              handleSubProcessOutput(process.getInputStream, System.out)
+              handleSubProcessOutput(process.getErrorStream, System.err)
+              Success(1)
+            }
+          }
+        case RunMode.Test =>
+          // do nothing in test mode.
+          Success(1)
+      }
+      divide match {
+        case Success(_) =>
+          logger.info(f"started Spark with callback URL $callbackUrl%s")
+          val sessionIdJson = write(SessionId(sessionId))
+          Ok ~> errorMsgContentType ~> ResponseString(sessionIdJson)
+        case Failure(e) =>
+          logger.error(e.getMessage)
+          InternalServerError ~> errorMsgContentType ~> ResponseString("Failed to start Spark\n")
+      }
+
+    case req@POST(Path("/jubaql")) =>
+      // TODO: treat very long input
+      val body = readAllFromReader(req.reader)
+      val reqSource = req.remoteAddr
+      logger.info(f"received HTTP request at /jubaql from $reqSource%s with body: $body%s")
+      val maybeJson = org.json4s.native.JsonMethods.parseOpt(body)
+      val maybeQuery = maybeJson.flatMap(_.extractOpt[Query])
+      maybeQuery match {
+        case None if maybeJson.isEmpty =>
+          logger.warn("received a query that is not JSON")
+          BadRequest ~> errorMsgContentType ~> ResponseString("Not JSON")
+        case None =>
+          logger.warn("received an unacceptable JSON query")
+          BadRequest ~> errorMsgContentType ~> ResponseString("Unacceptable JSON")
+        case Some(query) =>
+          var maybeKey: Option[String] = None
+          var maybeLoc: Option[(String, Int)] = None
+          session2key.synchronized {
+            maybeKey = session2key.get(query.session_id)
+            maybeLoc = session2loc.get(query.session_id)
+          }
+          (maybeKey, maybeLoc) match {
+            case (None, None) =>
+              logger.warn("received a query JSON without a usable session_id")
+              Unauthorized ~> errorMsgContentType ~> ResponseString("Unknown session_id")
+            case (None, Some(loc)) =>
+              logger.error("inconsistent data in this gateway server")
+              InternalServerError ~> errorMsgContentType ~> ResponseString("Inconsistent data")
+            case (Some(key), None) =>
+              logger.warn(s"processor for key $key has not registered yet")
+              ServiceUnavailable ~> errorMsgContentType ~>
+                ResponseString("This session has not been registered. Wait a second.")
+                ResponseString("This session has not been registered. Wait a second.")
+            case (Some(key), Some(loc)) =>
+              // TODO: check forward query
+              val (host, port) = loc
+
+              val queryJson = write(QueryToProcessor(query.query)).toString
+
+              val url = :/(host, port) / "jubaql"
+              val req = Http((url.POST << queryJson) > (x => x))
+
+              logger.debug(f"forward query to processor ($host%s:$port%d)")
+              req.either.apply() match {
+                case Left(error) =>
+                  logger.error("failed to send request to processor [" + error.getMessage + "]")
+                  BadGateway ~> errorMsgContentType ~> ResponseString("Bad gateway")
+                case Right(result) =>
+                  val statusCode = result.getStatusCode
+                  val responseBody = result.getResponseBody
+                  val contentType = Option(result.getContentType).getOrElse("text/plain; charset=utf-8")
+                  logger.debug(f"got result from processor [$statusCode%d: $responseBody%s]")
+                  Status(statusCode) ~> ContentType(contentType) ~> ResponseString(responseBody)
+              }
+          }
+      }
+
+    case req@POST(Path(Seg("registration" :: key :: Nil))) =>
+      // parse JSON and extract into case class
+      val maybeJson = JsonBody(req)
+      val maybeRegister = maybeJson.flatMap(_.extractOpt[Register]).
+        filter(_.action == "register")
+      val maybeUnregister = maybeJson.flatMap(_.extractOpt[Unregister]).
+        filter(_.action == "unregister")
+
+      if (!maybeRegister.isEmpty)
+        logger.info(f"start registration (key: $key%s)")
+      else if (!maybeUnregister.isEmpty)
+        logger.info(f"start unregistration (key: $key%s)")
+      else
+        logger.info(f"start registration or unregistration (key: $key%s)")
+
+      if (maybeJson.isEmpty) {
+        logger.warn("received query not in JSON format")
+        BadRequest ~> errorMsgContentType ~> ResponseString("Not JSON")
+      } else if (maybeRegister.isEmpty && maybeUnregister.isEmpty) {
+        logger.warn("received unacceptable JSON query")
+        BadRequest ~> errorMsgContentType ~> ResponseString("Unacceptable JSON")
+      } else {
+        session2key.synchronized {
+          val maybeSessionId = key2session.get(key)
+          if (!maybeRegister.isEmpty) { // register
+            val register = maybeRegister.get
+            val (ip, port) = (register.ip, register.port)
+            logger.debug(f"registering $ip%s:$port%d")
+            maybeSessionId match {
+              case None =>
+                logger.error("attempted to register unknown key")
+                Unauthorized ~> errorMsgContentType ~> ResponseString("Unknown key")
+              case Some(sessionId) =>
+                session2loc += (sessionId -> (ip, port))
+                Ok ~> errorMsgContentType ~> ResponseString("Successfully registered")
+            }
+          } else { // unregister
+            logger.debug("unregistering")
+            maybeSessionId match {
+              case Some(sessionId) => // unregistering an existing key
+                session2key -= sessionId
+                key2session -= key
+                session2loc -= sessionId
+              case _ => // unregistering a nonexistent key
+                ()
+            }
+            Ok ~> errorMsgContentType ~> ResponseString("Successfully unregistered")
+          }
+        }
+      }
+  }
+
+  private def composeCallbackUrl(ip: String, port: Int, key: String): String = {
+    f"http://$ip%s:$port%d/registration/$key%s"
+  }
+
+  private def handleSubProcessOutput(in: InputStream,
+                                     out: PrintStream): Unit = {
+    val thread = new SubProcessOutputHandlerThread(in, out, logger)
+    thread.setDaemon(true)
+    thread.start()
+  }
+
+  private def readAllFromReader(reader: java.io.Reader): String = {
+    val sb = new StringBuffer()
+    val buffer = new Array[Char](1024)
+    var nread = reader.read(buffer)
+    while (nread >= 0) {
+      sb.append(buffer, 0, nread)
+      nread = reader.read(buffer)
+    }
+    sb.toString
+  }
+}
+
+// An alphanumeric string generator.
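+// (Regarding the TODOs above: a cryptographically stronger variant could
+// seed the generator below from java.security.SecureRandom, e.g.
+//   val random = new Random(new java.security.SecureRandom())
+// since scala.util.Random can wrap any java.util.Random instance. This is
+// an illustrative sketch only.)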
+object Alphanumeric { + val random = new Random() + val chars = "0123456789abcdefghijklmnopqrstuvwxyz" + + def generate(length: Int): String = { + val ret = new Array[Char](length) + this.synchronized { + for (i <- 0 until ret.length) { + ret(i) = chars(random.nextInt(chars.length)) + } + } + new String(ret) + } +} + +private class SubProcessOutputHandlerThread(in: InputStream, + out: PrintStream, + logger: com.typesafe.scalalogging.Logger) extends Thread { + override def run(): Unit = { + val reader = new BufferedReader(new InputStreamReader(in)) + try { + var line = reader.readLine() + while (line != null) { + out.println(f"[spark-submit] $line%s") + line = reader.readLine() + } + } catch { + case e: IOException => + logger.warn("caught IOException in subprocess handler") + () + } + // Never close out here. + } +} + +sealed trait RunMode + +object RunMode { + case class Production(zookeeper: String, numExecutors: Int = 3, coresPerExecutor: Int = 2, + sparkJar: Option[String] = None) extends RunMode + case class Development(numThreads: Int = 3) extends RunMode + case object Test extends RunMode +} diff --git a/gateway/src/main/scala/us/jubat/jubaql_server/gateway/JubaQLGateway.scala b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/JubaQLGateway.scala new file mode 100644 index 0000000..62817f9 --- /dev/null +++ b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/JubaQLGateway.scala @@ -0,0 +1,119 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.gateway + +import com.typesafe.scalalogging.slf4j.LazyLogging +import scopt.OptionParser + +object JubaQLGateway extends LazyLogging { + val defaultPort = 9877 + + /** Main function to start the JubaQL gateway application. 
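+    *
+    * Behavior is configured via Java system properties (parsed below):
+    * run.mode ("development", "development:<numThreads>", "production" or
+    * "production:<numExecutors>:<coresPerExecutor>"), spark.distribution,
+    * jubaql.processor.fatjar and, in production mode, jubaql.zookeeper,
+    * plus the optional spark.yarn.jar.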
+ */ + def main(args: Array[String]) { + val maybeParsedOptions: Option[CommandlineOptions] = parseCommandlineOption(args) + if (maybeParsedOptions.isEmpty) + System.exit(1) + val parsedOptions = maybeParsedOptions.get + + val ipAddress: String = parsedOptions.ip + val port: Int = parsedOptions.port + + var envp: Array[String] = Array() + var runMode: RunMode = RunMode.Development() + val runModeProperty: String = System.getProperty("run.mode") + val sparkJar = Option(System.getProperty("spark.yarn.jar")) + val zookeeperString = scala.util.Properties.propOrElse("jubaql.zookeeper", "") + val devModeRe = "development:([0-9]+)".r + val prodModeRe = "production:([0-9]+):([0-9]+)".r + runModeProperty match { + case null | "" | "development" => + runMode = RunMode.Development() + + case devModeRe(numThreadsString) => + runMode = RunMode.Development(numThreadsString.toInt) + + case "production" => + runMode = RunMode.Production(zookeeperString, sparkJar = sparkJar) + + case prodModeRe(numExecutorsString, coresPerExecutorString) => + runMode = RunMode.Production(zookeeperString, numExecutorsString.toInt, + coresPerExecutorString.toInt, sparkJar = sparkJar) + + case _ => + System.err.println("Bad run.mode property") + System.exit(1) + } + + runMode match { + case p: RunMode.Production => + System.getenv("HADOOP_CONF_DIR") match { + case null => + logger.warn("HADOOP_CONF_DIR not set, using default") + // set HADOOP_CONF_DIR if there is no such environment variable + envp = Array("HADOOP_CONF_DIR=/etc/hadoop/conf") + case path => + envp = Array(s"HADOOP_CONF_DIR=$path") + } + // Require that zookeeper is given in production mode. + // Syntax check must be done by JubaQLProcessor. + if (zookeeperString.trim.isEmpty) { + logger.error("system property jubaql.zookeeper must be given " + + "in production mode (comma-separated host:port list)") + System.exit(1) + } + case _ => + // don't set environment in dev mode + } + logger.info("Starting in run mode %s".format(runMode)) + + val sparkDistribution: String = System.getProperty("spark.distribution") + if (sparkDistribution == null || sparkDistribution.trim.isEmpty) { + System.err.println("No spark.distribution property") + System.exit(1) + } + val fatjar: String = System.getProperty("jubaql.processor.fatjar") + if (fatjar == null || fatjar.trim.isEmpty) { + System.err.println("No jubaql.processor.fatjar") + System.exit(1) + } + val plan = new GatewayPlan(ipAddress, port, envp, runMode, sparkDistribution, fatjar) + val nettyServer = unfiltered.netty.Server.http(port).plan(plan) + logger.info("JubaQLGateway starting") + nettyServer.run() + logger.info("JubaQLGateway shut down successfully") + } + + def parseCommandlineOption(args: Array[String]): Option[CommandlineOptions] = { + val parser = new OptionParser[CommandlineOptions]("JubaQLGateway") { + opt[String]('i', "ip") required() valueName ("") action { + (x, o) => + o.copy(ip = x) + } text ("IP address") + opt[Int]('p', "port") optional() valueName ("") action { + (x, o) => + o.copy(port = x) + } validate { + x => + if (x >= 1 && x <= 65535) success else failure("bad port number; port number n must be \"1 <= n <= 65535\"") + } text (f"port (default: $defaultPort%d)") + } + + parser.parse(args, CommandlineOptions()) + } +} + +case class CommandlineOptions(ip: String = "", port: Int = JubaQLGateway.defaultPort) diff --git a/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/Query.scala b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/Query.scala new file mode 100644 index 
0000000..01de387 --- /dev/null +++ b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/Query.scala @@ -0,0 +1,18 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.gateway.json + +case class Query(session_id: String, query: String) diff --git a/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/QueryToProcessor.scala b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/QueryToProcessor.scala new file mode 100644 index 0000000..f7cbb64 --- /dev/null +++ b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/QueryToProcessor.scala @@ -0,0 +1,18 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.gateway.json + +case class QueryToProcessor(query: String) diff --git a/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/Register.scala b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/Register.scala new file mode 100644 index 0000000..117b763 --- /dev/null +++ b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/Register.scala @@ -0,0 +1,18 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.gateway.json + +case class Register(action: String, ip: String, port: Int) diff --git a/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/SessionId.scala b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/SessionId.scala new file mode 100644 index 0000000..34acd69 --- /dev/null +++ b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/SessionId.scala @@ -0,0 +1,18 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.gateway.json + +case class SessionId(session_id: String) diff --git a/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/Unregister.scala b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/Unregister.scala new file mode 100644 index 0000000..bfe57a5 --- /dev/null +++ b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/json/Unregister.scala @@ -0,0 +1,18 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.gateway.json + +case class Unregister(action: String) diff --git a/gateway/src/test/resources/processor-logfile.jar b/gateway/src/test/resources/processor-logfile.jar new file mode 100644 index 0000000000000000000000000000000000000000..050da0bd9889ddb0f0d712c5ad0e75f4b86066fa GIT binary patch literal 728 zcmWIWW@Zs#U|`^2$giw%y`UOwZNS99V9dtAz{w!PkdvQolBHLXn-dzs$-pcmR1pHg zr4`%^j4Ush85qFC){w;P+Xe!E=fxL_w$#pinwJyMG}-7@8{4+658gOS{$6lP+`U&f z^O9v`)UW&H8z-Ag*^)D%$xy87`@PSXUrxGgAIe$$#WDJ_xcrli3??(rZq@wt=y-+1 zTn-D_l1Ym{uipIe>{r$)+kTW>KfPw=i$&kMcJN)AT~hOr*JZPM@u{N=E_;4cj6B;C zxB0Sou+sJJ$JVC7^QMIOr3O#ACSJ(gF_Tpx@Z7?a7n=h`zp;L1jb7BSCPeWI>tXY$ zYyU)Y|JbA_%$y+htnk$8EMKFreGWnf;i+d&mq>DGXJ!k1WQ$v+x;{Q+XPC>2v)9?S zUaVMl$nV<8br+)#=KU+Eyt#T+9EXwbW%nO9m&RY{wX=Sa#xF0g*pzqG^vyC;;q7^| zY##Y4%qe_*MeL}VT<%iGMN1{t$gX9uIKF8ro7;>|(GN%NguH)xDo2&q{E*^~GNw7J z?nHf8Te2zWTyk7`jnA*JtjAfCoMRtld^O#&FKCN)jfm?4-kyE@>lbXY+{d0R;B{92 zxYqRNhegz4_^cW}-x7GHu(U;^>f&Fgwda1u-%Sq75)Zhyw=V6!`>wtx_ijlpijHk; z{>1j~<4r}2^em&`b|Z5uIl;6~b;mjvE!_Siedc2Aw?TU0Jm+_HEW4xhr{Yg+cAnz~ zIWZ3I?0bSA)-bX*id@xh**s5S{m%JZ86ArQ?uX5a%U(TQ-}TYjrEe#EzE$OWIybg& zd+EoJ9W!Mo9_O^4`R2^!vQMgo&d1~Ds@yX^$Ub*}fHxzP95b#&Cjm^13=E7wyrmJu fLP~ + + protected val plan = new GatewayPlan("example.com", 1234, + Array(), RunMode.Test, + "", "src/test/resources/processor-logfile.jar") + protected val server = unfiltered.netty.Server.http(9877).plan(plan) + + override protected def beforeAll() = { + server.start() + } + + override protected def afterAll() = { + server.stop() + } +} diff --git a/gateway/src/test/scala/us/jubat/jubaql_server/gateway/JubaQLSpec.scala b/gateway/src/test/scala/us/jubat/jubaql_server/gateway/JubaQLSpec.scala new file mode 100644 index 0000000..789ab94 --- /dev/null +++ b/gateway/src/test/scala/us/jubat/jubaql_server/gateway/JubaQLSpec.scala @@ -0,0 +1,79 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.gateway + +import us.jubat.jubaql_server.gateway.json.Query +import org.scalatest._ +import EitherValues._ +import dispatch._ +import dispatch.Defaults._ +import org.json4s.DefaultFormats +import org.json4s.native.Serialization.write + +// We use mock processor in this test, so queries are dummies. 
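+// For reference, the HTTP exchange exercised below looks like this when run
+// against a gateway (illustrative session id and responses):
+//
+//   POST /login   ->  200 {"session_id": "abc..."}
+//   POST /jubaql  with {"session_id": "abc...", "query": "..."}
+//                 ->  the (mock) processor's response, forwarded verbatim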
+ +class JubaQLSpec extends FlatSpec with Matchers with ProcessorAndGatewayServer { + val jubaQlUrl = :/("localhost", 9877) / "jubaql" + + implicit val formats = DefaultFormats + + def requestAsJson() = { + val request = (jubaQlUrl).POST + request.setContentType("application/json", "UTF-8") + } + + "Posting jubaql with not a JSON" should "fail" in { + val request = requestAsJson() << "abc" + val result = Http(request > (x => x)).either.apply() + result.right.value.getStatusCode shouldBe 400 + result.right.value.getContentType should include("charset=utf-8") + } + + "Posting jubaql with an empty JSON object" should "fail" in { + val request = requestAsJson() << "{}" + val result = Http(request > (x => x)).either.apply() + result.right.value.getStatusCode shouldBe 400 + result.right.value.getContentType should include("charset=utf-8") + } + + "Posting jubaql without session_id" should "fail" in { + val request = requestAsJson() << """{"query": "query"}""" + val result = Http(request > (x => x)).either.apply() + result.right.value.getStatusCode shouldBe 400 + result.right.value.getContentType should include("charset=utf-8") + } + + "Posting jubaql without query" should "fail" in { + val request = requestAsJson() << f"""{"session_id": "$session%s"}""" + val result = Http(request > (x => x)).either.apply() + result.right.value.getStatusCode shouldBe 400 + result.right.value.getContentType should include("charset=utf-8") + } + + "Posting jubaql with unknown session_id" should "fail" in { + val request = requestAsJson() << write(Query("NOSUCHID", "query")).toString + val result = Http(request > (x => x)).either.apply() + result.right.value.getStatusCode shouldBe 401 + result.right.value.getContentType should include("charset=utf-8") + } + + "Posting jubaql with a valid JSON" should "succeed" in { + val request = requestAsJson() << write(Query(session, "query")).toString + val result = Http(request > (x => x)).either.apply() + result.right.value.getStatusCode shouldBe 200 + result.right.value.getContentType should include("charset=utf-8") + } +} diff --git a/gateway/src/test/scala/us/jubat/jubaql_server/gateway/LoginSpec.scala b/gateway/src/test/scala/us/jubat/jubaql_server/gateway/LoginSpec.scala new file mode 100644 index 0000000..b804266 --- /dev/null +++ b/gateway/src/test/scala/us/jubat/jubaql_server/gateway/LoginSpec.scala @@ -0,0 +1,56 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.gateway + +import us.jubat.jubaql_server.gateway.json.SessionId +import org.scalatest._ +import dispatch._ +import dispatch.Defaults._ +import org.json4s._ +import org.json4s.Formats._ +import org.json4s.native.Serialization.{read, write} +import org.json4s.native.JsonMethods._ + +class LoginSpec extends FlatSpec with Matchers with GatewayServer { + + implicit val formats = DefaultFormats + + val url = :/("localhost", 9877) / "login" + + "POST to /login" should "return something" in { + val req = Http(url.POST OK as.String) + req.option.apply() should not be None + } + + "POST to /login" should "return a JSON" in { + val req = Http(url.POST OK as.String) + req.option.apply() should not be None + val returnedString = req.option.apply.get + val maybeJson = parseOpt(returnedString) + maybeJson should not be None + } + + "POST to /login" should "return a JSON which contains session_id" in { + val req = Http(url.POST OK as.String) + req.option.apply() should not be None + val returnedString = req.option.apply.get + val maybeJson = parseOpt(returnedString) + maybeJson should not be None + val maybeSessionId = maybeJson.get.extractOpt[SessionId] + maybeSessionId should not be None + maybeSessionId.get.session_id.length should be > 0 + } +} diff --git a/gateway/src/test/scala/us/jubat/jubaql_server/gateway/ProcessorAndGatewayServer.scala b/gateway/src/test/scala/us/jubat/jubaql_server/gateway/ProcessorAndGatewayServer.scala new file mode 100644 index 0000000..50d4f59 --- /dev/null +++ b/gateway/src/test/scala/us/jubat/jubaql_server/gateway/ProcessorAndGatewayServer.scala @@ -0,0 +1,79 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.gateway + +import us.jubat.jubaql_server.gateway.json.QueryToProcessor +import com.typesafe.scalalogging.slf4j.LazyLogging +import org.jboss.netty.handler.execution.MemoryAwareThreadPoolExecutor +import unfiltered.request._ +import unfiltered.response._ +import unfiltered.netty.{cycle, ServerErrorResponse} +import unfiltered.util.RunnableServer +import org.scalatest.{Suite, BeforeAndAfterAll} +import org.json4s._ + +trait ProcessorAndGatewayServer extends GatewayServer { + this: Suite => + + val session = "TESTSESSIONID" + // This variable does not have name "key" due to multiple inheritance conflict + val key_ = "KEY" + val loc = ("localhost", 9876) + + protected val processorMock: RunnableServer = + unfiltered.netty.Server.http(9876).plan( + new ProcessorMockPlan + ) + + override def beforeAll(): Unit = { + plan.session2key += (session -> key_) + plan.key2session += (key_ -> session) + plan.session2loc += (session -> loc) + super.beforeAll() + + processorMock.start() + } + + override def afterAll(): Unit = { + processorMock.stop() + + super.afterAll() + } +} + +class ProcessorMockPlan + extends cycle.Plan + with cycle.DeferralExecutor with cycle.DeferredIntent + with ServerErrorResponse + with LazyLogging { + + lazy val underlying = new MemoryAwareThreadPoolExecutor(16, 65536, 1048576) + + implicit val formats = DefaultFormats + + def intent = { + case req@POST(Path("/jubaql")) => + val maybeJson = JsonBody(req) + val maybeQuery = maybeJson.flatMap(_.extractOpt[QueryToProcessor]) + maybeQuery match { + case None => + BadRequest ~> ResponseString("Valid JSON is required") + case Some(query) => + val queryString = query.query + Ok ~> ResponseString("Valid JSON") + } + } +} diff --git a/gateway/src/test/scala/us/jubat/jubaql_server/gateway/RegisterSpec.scala b/gateway/src/test/scala/us/jubat/jubaql_server/gateway/RegisterSpec.scala new file mode 100644 index 0000000..c2cb8f7 --- /dev/null +++ b/gateway/src/test/scala/us/jubat/jubaql_server/gateway/RegisterSpec.scala @@ -0,0 +1,120 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.gateway + +import us.jubat.jubaql_server.gateway.json.Register +import org.scalatest._ +import org.json4s._ +import dispatch._ +import dispatch.Defaults._ +import org.json4s.Formats._ +import org.json4s.native.Serialization.{read, write} +import org.json4s.native.JsonMethods._ +import EitherValues._ + +class RegisterSpec extends FlatSpec with Matchers with GatewayServer { + + implicit val formats = DefaultFormats + + val loginUrl = :/("localhost", 9877) / "login" + val registrationUrl = :/("localhost", 9877) / "registration" + + def requestAsJson(key: String) = { + val request = (registrationUrl / key).POST + request.setContentType("application/json", "UTF-8") + } + + override def beforeAll(): Unit = { + super.beforeAll() + // login twice + for (i <- 0 until 2) { + val req = Http(loginUrl.POST OK as.String) + req.option.apply() + } + } + + def keys = { + var keys = List.empty[String] + // This lock is required because the server is in the same process. + plan.session2key.synchronized { + for (key <- plan.key2session.keys) + keys = key :: keys + } + keys + } + + "Registering with not a JSON" should "fail" in { + for (key <- keys) { + val requestWithBody = requestAsJson(key) << "abc" + val result = Http(requestWithBody OK as.String).either.apply() + result.left.value.getMessage shouldBe "Unexpected response status: 400" + } + } + + "Registering with a JSON which is not for registering" should "fail" in { + for (key <- keys) { + val requestWithBody = requestAsJson(key) << """{"ip": "8.8.8.8", "port": 30}""" + val result = Http(requestWithBody OK as.String).either.apply() + result.left.value.getMessage shouldBe "Unexpected response status: 400" + } + } + + "Registering a nonexistent key" should "fail" in { + for (key <- keys) { + val garbageKey = key + "garbage" + val registerJson = write(Register("register", "8.8.8.8", 30)) + val requestWithBody = requestAsJson(garbageKey) << registerJson.toString + val result = Http(requestWithBody OK as.String).either.apply() + result.left.value.getMessage shouldBe "Unexpected response status: 401" + } + } + + "Registering an existing key" should "succeed" in { + for (key <- keys) { + val registerJson = write(Register("register", "8.8.8.8", 30)) + val requestWithBody = requestAsJson(key) << registerJson.toString + Http(requestWithBody OK as.String).option.apply() should not be None + plan.session2key.synchronized { + val session = plan.key2session.get(key) + session should not be None + val maybeLoc = plan.session2loc.get(session.get) + maybeLoc should not be None + val loc = maybeLoc.get + loc shouldBe ("8.8.8.8", 30) + } + } + } + + "Registering already registered settings" should "overwrite" in { + for (key <- keys) { + for (i <- 0 until 2) { + val ip = "8.8.8." 
+ (8 + i).toString + val port = 30 + i + val registerJson = write(Register("register", ip, port)) + val requestWithBody = requestAsJson(key) << registerJson.toString + Http(requestWithBody OK as.String).option.apply() should not be None + plan.session2key.synchronized { + val session = plan.key2session.get(key) + session should not be None + val maybeLoc = plan.session2loc.get(session.get) + maybeLoc should not be None + val loc = maybeLoc.get + loc shouldBe (ip, port) + } + } + } + } +} diff --git a/gateway/src/test/scala/us/jubat/jubaql_server/gateway/UnregisterSpec.scala b/gateway/src/test/scala/us/jubat/jubaql_server/gateway/UnregisterSpec.scala new file mode 100644 index 0000000..fb0b895 --- /dev/null +++ b/gateway/src/test/scala/us/jubat/jubaql_server/gateway/UnregisterSpec.scala @@ -0,0 +1,113 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.gateway + +import dispatch.Defaults._ +import dispatch._ +import us.jubat.jubaql_server.gateway.json.Register +import org.json4s._ +import org.json4s.native.Serialization.write +import org.scalatest._ +import EitherValues._ + +class UnregisterSpec extends FlatSpec with Matchers with GatewayServer with BeforeAndAfter { + + implicit val formats = DefaultFormats + + val loginUrl = :/("localhost", 9877) / "login" + val registrationUrl = :/("localhost", 9877) / "registration" + + val registerJson = write(Register("register", "8.8.8.8", 30)).toString + val unregisterJson = """{"action": "unregister"}""" + + def requestAsJson(key: String) = { + val request = (registrationUrl / key).POST + request.setContentType("application/json", "UTF-8") + } + + before { + // login twice + for (i <- 0 until 2) { + val req = Http(loginUrl.POST OK as.String) + req.option.apply() + } + + // register all keys + for (key <- keys) { + val requestWithBody = requestAsJson(key) << registerJson + Http(requestWithBody OK as.String).option.apply() + } + } + + after { + // unregister all keys + for (key <- keys) { + val requestWithBody = requestAsJson(key) << unregisterJson + Http(requestWithBody OK as.String).option.apply() + } + } + + def keys = { + var keys = List.empty[String] + // This lock is required because the server is in the same process. 
+    plan.session2key.synchronized {
+      for (key <- plan.key2session.keys)
+        keys = key :: keys
+    }
+    keys
+  }
+
+  "Unregistering with not a JSON" should "fail" in {
+    for (key <- keys) {
+      val requestWithNotAJson = requestAsJson(key) << "abc"
+      val result = Http(requestWithNotAJson OK as.String).either.apply()
+      result.left.value.getMessage shouldBe "Unexpected response status: 400"
+    }
+  }
+
+  "Unregistering with a JSON which is not for unregistering" should "fail" in {
+    for (key <- keys) {
+      val requestWithBadJson = requestAsJson(key) << "{}"
+      val result = Http(requestWithBadJson OK as.String).either.apply()
+      result.left.value.getMessage shouldBe "Unexpected response status: 400"
+    }
+  }
+
+  "Unregistering a nonexistent key" should "succeed" in {
+    for (key <- keys) {
+      val garbageKey = key + "garbage"
+      val requestWithBody = requestAsJson(garbageKey) << unregisterJson
+      val result = Http(requestWithBody OK as.String).either.apply()
+      result.right.value shouldBe "Successfully unregistered"
+    }
+  }
+
+  "Unregistering an existing key" should "succeed" in {
+    for (key <- keys) {
+      var sessionId = ""
+      plan.session2key.synchronized {
+        sessionId = plan.key2session.get(key).get
+      }
+      val requestWithBody = requestAsJson(key) << unregisterJson
+      Http(requestWithBody OK as.String).option.apply() should not be None
+      plan.session2key.synchronized {
+        plan.session2key.get(sessionId) shouldBe None
+        plan.key2session.get(key) shouldBe None
+        plan.session2loc.get(sessionId) shouldBe None
+      }
+    }
+  }
+}
diff --git a/processor/assembly.sbt b/processor/assembly.sbt
new file mode 100644
index 0000000..5e3272c
--- /dev/null
+++ b/processor/assembly.sbt
@@ -0,0 +1,61 @@
+import AssemblyKeys._
+
+assemblySettings
+
+test in assembly := {}
+
+jarName in assembly := "jubaql-processor-assembly-" + version.value + ".jar"
+
+/// We MUST include Scala libraries, otherwise scalalogging won't
+/// be included:
+// assemblyOption in assembly ~= {
+//   _.copy(includeScala = false)
+// }
+
+mergeStrategy in assembly <<= (mergeStrategy in assembly) {
+  (old) => {
+    //// The following conflicts only need to be fixed when Spark dependencies
+    //// are not marked as "provided":
+    //
+    // javax.transaction-1.1.1.v201105210645.jar:META-INF/ECLIPSEF.RSA vs.
+    // javax.servlet-3.0.0.v201112011016.jar:META-INF/ECLIPSEF.RSA vs.
+    // javax.mail.glassfish-1.4.1.v201005082020.jar:META-INF/ECLIPSEF.RSA vs.
+    // javax.activation-1.1.0.v201105071233.jar:META-INF/ECLIPSEF.RSA
+    case x if x.startsWith("META-INF/ECLIPSEF.RSA") => MergeStrategy.discard
+    // javax.mail.glassfish-1.4.1.v201005082020.jar:META-INF/mailcap vs.
+    // javax.activation-1.1.0.v201105071233.jar:META-INF/mailcap
+    case x if x.startsWith("META-INF/mailcap") => MergeStrategy.last
+    // slf4j-api-1.7.7.jar:META-INF/maven/org.slf4j/slf4j-api/pom.properties vs.
+    // parquet-format-2.0.0.jar:META-INF/maven/org.slf4j/slf4j-api/pom.properties
+    // and others
+    case x if x.startsWith("META-INF/maven/org.slf4j/") => MergeStrategy.last
+    // kryo-2.21.jar:com/esotericsoftware/minlog/Log$Logger.class vs.
+    // minlog-1.2.jar:com/esotericsoftware/minlog/Log$Logger.class
+    case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last
+    // commons-beanutils-1.7.0.jar:org/apache/commons/beanutils/BasicDynaBean.class vs.
+    // commons-beanutils-core-1.8.0.jar:org/apache/commons/beanutils/BasicDynaBean.class
+    // and others
+    case PathList("org", "apache", xs @ _*) => MergeStrategy.last
+    // javax.transaction-1.1.1.v201105210645.jar:plugin.properties vs.
+ // javax.servlet-3.0.0.v201112011016.jar:plugin.properties vs. + // javax.mail.glassfish-1.4.1.v201005082020.jar:plugin.properties vs. + // javax.activation-1.1.0.v201105071233.jar:plugin.properties + // and others + case x if x.startsWith("plugin.properties") => MergeStrategy.last + // jubatus-on-yarn-client_2.10.jar:log4j.xml vs. + // our own log4j.xml + case x if x.startsWith("log4j.xml") => MergeStrategy.first + // + case x => old(x) + } +} + +// take only the Spark and Hadoop jars out (this is more or less an +// alternative to marking Spark as "provided") +excludedJars in assembly <<= (fullClasspath in assembly) map { cp => + cp filter {item => item.data.getPath.contains("/org.apache.hadoop/")} +} + +// add "provided" dependencies back to classpath when using "sbt run". +// this does not affect the "run" function in IDEA (i.e., it can't be used) +run in Compile <<= Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)) diff --git a/processor/build.sbt b/processor/build.sbt new file mode 100644 index 0000000..5bd2741 --- /dev/null +++ b/processor/build.sbt @@ -0,0 +1,122 @@ +import com.typesafe.sbt.SbtStartScript +import java.io.File + +name := "JubaQL Processor" + +version := "1.2.0" + +// use 2.10 for now (Spark has no 2.11 support yet) +scalaVersion := "2.10.4" + +// to prevent problems with encfs path length issues +scalacOptions ++= Seq( "-Xmax-classfile-name", "140" ) + +// Add Jubatus repository +resolvers += "Jubatus" at "http://download.jubat.us/maven" + +// Add Cloudera repository +resolvers += "Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/" + +// Add msgpack repository (sbt does not use the information provided in the Jubatus POM) +resolvers += "MessagePack" at "http://msgpack.org/maven2" + +libraryDependencies ++= Seq( + // logging + "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2", + "org.slf4j" % "slf4j-api" % "1.7.7", + "org.slf4j" % "slf4j-log4j12" % "1.7.7", + // Jubatus + "us.jubat" % "jubatus" % "0.6.0" + exclude("org.jboss.netty", "netty"), + // jubatusonyarn + "us.jubat" %% "jubatus-on-yarn-client" % "1.0" + exclude("javax.servlet", "servlet-api") + exclude("org.jboss.netty", "netty"), + // HTTP server + "com.twitter" %% "finagle-http" % "6.7.4", + "org.json4s" %% "json4s-native" % "3.2.10", + "org.json4s" %% "json4s-ext" % "3.2.10", + // parsing of program arguments + "com.github.scopt" %% "scopt" % "3.2.0", + // Spark + "org.apache.spark" %% "spark-core" % "1.1.1" % "provided", + // the following will prevent org.spark-project.akka:akka-remote_2.10:2.2.3-shaded-protobuf + // from pulling in io.netty:netty:3.6.6.Final, but it will not prevent spark-core + // itself to pull in io.netty:netty-all:4.0.23.Final (note that the former + // includes the package "org.jboss.netty", while the latter includes "io.netty".) 
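+  // (Note: exclude(org, name) only removes the named transitive dependency
+  // from the artifact it is attached to; it therefore has to be repeated
+  // for every dependency that pulls the unwanted jar in.)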
+ "org.spark-project.akka" %% "akka-remote" % "2.2.3-shaded-protobuf" + exclude("io.netty", "netty"), + "org.apache.spark" %% "spark-streaming" % "1.1.1" % "provided", + "org.apache.spark" %% "spark-streaming-kafka" % "1.1.1" + exclude("commons-beanutils", "commons-beanutils") + exclude("commons-collections", "commons-collections") + exclude("com.esotericsoftware.minlog", "minlog"), + "org.apache.spark" %% "spark-sql" % "1.1.1", + // registration with the gateway + "net.databinder.dispatch" %% "dispatch-core" % "0.11.2", + // HDFS + "org.apache.hadoop" % "hadoop-client" % "2.5.0-cdh5.2.0" % "provided", + // for testing + "org.scalatest" %% "scalatest" % "2.2.1" % "test", + "net.databinder" %% "unfiltered-filter" % "0.8.2" % "test", + "net.databinder" %% "unfiltered-json4s" % "0.8.2" % "test", + "net.databinder" %% "unfiltered-netty-server" % "0.8.2" % "test" +) + +// disable parallel test execution to avoid conflicting to launch jubatus when mocking +// Jubatus servers +parallelExecution in Test := false + +net.virtualvoid.sbt.graph.Plugin.graphSettings + +// add the "start-script" task as per +// +seq(SbtStartScript.startScriptForClassesSettings: _*) + +SbtStartScript.StartScriptKeys.startScriptName <<= baseDirectory / "start-script/run" + +// add "provided" dependencies back to classpath when using "sbt start-script". +SbtStartScript.StartScriptKeys.relativeFullClasspathString in Compile <<= + (SbtStartScript.StartScriptKeys.startScriptBaseDirectory, fullClasspath in Compile) map myRelativeClasspathStringTask + +// the three functions below are 1:1 copies (with changed names) from +// SbtStartScript.scala, with the `private` modifier removed because there seems +// to be no other way to modify the classpath for sbt-start-script; +// cf. + +def myRelativeClasspathStringTask(baseDirectory: File, cp: Classpath) = { + SbtStartScript.RelativeClasspathString(cp.files map { f => myRelativizeFile(baseDirectory, f, "$PROJECT_DIR") } mkString ("", java.io.File.pathSeparator, "")) +} + +def myRelativizeFile(baseDirectory: File, f: File, prefix: String = ".") = { + if (java.io.File.separatorChar != '/') { + f + } else { + val baseCanonical = baseDirectory.getCanonicalFile() + val fCanonical = f.getCanonicalFile() + if (myDirectoryEqualsOrContains(baseCanonical, fCanonical)) { + val basePath = baseCanonical.getAbsolutePath() + val fPath = fCanonical.getAbsolutePath() + if (fPath.startsWith(basePath)) { + new File(prefix + fPath.substring(basePath.length)) + } else { + sys.error("Internal bug: %s contains %s but is not a prefix of it".format(basePath, fPath)) + } + } else { + // leave it as-is, don't even canonicalize + f + } + } +} + +def myDirectoryEqualsOrContains(d: File, f: File): Boolean = { + if (d == f) { + true + } else { + val p = f.getParentFile() + if (p == null) + false + else + myDirectoryEqualsOrContains(d, p) + } +} diff --git a/processor/project/assembly.sbt b/processor/project/assembly.sbt new file mode 100644 index 0000000..54c3252 --- /dev/null +++ b/processor/project/assembly.sbt @@ -0,0 +1 @@ +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") diff --git a/processor/project/deptree.sbt b/processor/project/deptree.sbt new file mode 100644 index 0000000..3c9aed7 --- /dev/null +++ b/processor/project/deptree.sbt @@ -0,0 +1 @@ +addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4") diff --git a/processor/project/plugins.sbt b/processor/project/plugins.sbt new file mode 100644 index 0000000..c67c87f --- /dev/null +++ b/processor/project/plugins.sbt @@ -0,0 +1 @@ 
+addSbtPlugin("com.typesafe.sbt" % "sbt-start-script" % "0.10.0") diff --git a/processor/src/main/resources/core-site.xml.dist b/processor/src/main/resources/core-site.xml.dist new file mode 100644 index 0000000..3e3540a --- /dev/null +++ b/processor/src/main/resources/core-site.xml.dist @@ -0,0 +1,128 @@ + + + + + + fs.defaultFS + hdfs://[host]:[port] + + + fs.trash.interval + 1 + + + io.compression.codecs + org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.compress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec + + + hadoop.security.authentication + simple + + + hadoop.security.authorization + false + + + hadoop.rpc.protection + authentication + + + hadoop.ssl.require.client.cert + false + true + + + hadoop.ssl.keystores.factory.class + org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory + true + + + hadoop.ssl.server.conf + ssl-server.xml + true + + + hadoop.ssl.client.conf + ssl-client.xml + true + + + hadoop.security.auth_to_local + DEFAULT + + + hadoop.proxyuser.oozie.hosts + * + + + hadoop.proxyuser.oozie.groups + * + + + hadoop.proxyuser.mapred.hosts + * + + + hadoop.proxyuser.mapred.groups + * + + + hadoop.proxyuser.flume.hosts + * + + + hadoop.proxyuser.flume.groups + * + + + hadoop.proxyuser.HTTP.hosts + * + + + hadoop.proxyuser.HTTP.groups + * + + + hadoop.proxyuser.hive.hosts + * + + + hadoop.proxyuser.hive.groups + * + + + hadoop.proxyuser.hue.hosts + * + + + hadoop.proxyuser.hue.groups + * + + + hadoop.proxyuser.httpfs.hosts + * + + + hadoop.proxyuser.httpfs.groups + * + + + hadoop.proxyuser.hdfs.groups + * + + + hadoop.proxyuser.hdfs.hosts + * + + + hadoop.security.group.mapping + org.apache.hadoop.security.ShellBasedUnixGroupsMapping + + + hadoop.security.instrumentation.requires.admin + false + + diff --git a/processor/src/main/resources/hdfs-site.xml.dist b/processor/src/main/resources/hdfs-site.xml.dist new file mode 100644 index 0000000..790de19 --- /dev/null +++ b/processor/src/main/resources/hdfs-site.xml.dist @@ -0,0 +1,68 @@ + + + + + + dfs.namenode.name.dir + file:///dfs/nn + + + dfs.namenode.servicerpc-address + [host]:[port] + + + dfs.https.address + [host]:[port] + + + dfs.https.port + 50470 + + + dfs.namenode.http-address + [host]:[port] + + + dfs.replication + 3 + + + dfs.blocksize + 134217728 + + + dfs.client.use.datanode.hostname + false + + + fs.permissions.umask-mode + 022 + + + dfs.namenode.acls.enabled + false + + + dfs.client.read.shortcircuit + false + + + dfs.domain.socket.path + /var/run/hdfs-sockets/dn + + + dfs.client.read.shortcircuit.skip.checksum + false + + + dfs.client.domain.socket.data.traffic + false + + + dfs.datanode.hdfs-blocks-metadata.enabled + true + + diff --git a/processor/src/main/resources/log4j.xml b/processor/src/main/resources/log4j.xml new file mode 100644 index 0000000..ee27083 --- /dev/null +++ b/processor/src/main/resources/log4j.xml @@ -0,0 +1,59 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/processor/src/main/resources/yarn-site.xml.dist b/processor/src/main/resources/yarn-site.xml.dist new file mode 100644 index 0000000..ad2b61a --- /dev/null +++ b/processor/src/main/resources/yarn-site.xml.dist @@ -0,0 +1,136 @@ + + + + + + yarn.acl.enable + true + + + yarn.admin.acl + * + + + yarn.resourcemanager.address + [host]:[port] + + + yarn.resourcemanager.admin.address + [host]:[port] + + + 
yarn.resourcemanager.scheduler.address + [host]:[port] + + + yarn.resourcemanager.resource-tracker.address + [host]:[port] + + + yarn.resourcemanager.webapp.address + [host]:[port] + + + yarn.resourcemanager.webapp.https.address + [host]:[port] + + + yarn.resourcemanager.client.thread-count + 50 + + + yarn.resourcemanager.scheduler.client.thread-count + 50 + + + yarn.resourcemanager.admin.client.thread-count + 1 + + + yarn.scheduler.minimum-allocation-mb + 1024 + + + yarn.scheduler.increment-allocation-mb + 512 + + + yarn.scheduler.maximum-allocation-mb + 6538 + + + yarn.scheduler.minimum-allocation-vcores + 1 + + + yarn.scheduler.increment-allocation-vcores + 1 + + + yarn.scheduler.maximum-allocation-vcores + 4 + + + yarn.resourcemanager.amliveliness-monitor.interval-ms + 1000 + + + yarn.am.liveness-monitor.expiry-interval-ms + 600000 + + + yarn.resourcemanager.am.max-attempts + 2 + + + yarn.resourcemanager.container.liveness-monitor.interval-ms + 600000 + + + yarn.resourcemanager.nm.liveness-monitor.interval-ms + 1000 + + + yarn.nm.liveness-monitor.expiry-interval-ms + 600000 + + + yarn.resourcemanager.resource-tracker.client.thread-count + 50 + + + yarn.application.classpath + $HADOOP_CLIENT_CONF_DIR,$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/* + + + yarn.resourcemanager.scheduler.class + org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler + + + yarn.scheduler.fair.user-as-default-queue + true + + + yarn.scheduler.fair.preemption + false + + + yarn.scheduler.fair.sizebasedweight + false + + + yarn.scheduler.fair.assignmultiple + false + + + yarn.resourcemanager.max-completed-applications + 10000 + + + yarn.nodemanager.aux-services + + + diff --git a/processor/src/main/scala/org/apache/spark/streaming/dstream/OrderedFileInputDStream.scala b/processor/src/main/scala/org/apache/spark/streaming/dstream/OrderedFileInputDStream.scala new file mode 100644 index 0000000..9f13814 --- /dev/null +++ b/processor/src/main/scala/org/apache/spark/streaming/dstream/OrderedFileInputDStream.scala @@ -0,0 +1,272 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * ---
+ *
+ * Based on FileInputDStream from the Apache Spark 1.1.0 distribution.
+ */
+
+package org.apache.spark.streaming.dstream
+
+import java.io.{ObjectInputStream, IOException}
+import scala.collection.mutable.{HashSet, HashMap}
+import scala.reflect.ClassTag
+import org.apache.hadoop.fs.{FileSystem, Path, PathFilter}
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.rdd.UnionRDD
+import org.apache.spark.streaming.{StreamingContext, Time}
+import org.apache.spark.util.TimeStampedHashMap
+
+
+class OrderedFileInputDStream[K: ClassTag, V: ClassTag, F <: NewInputFormat[K,V] : ClassTag](
+    @transient ssc_ : StreamingContext,
+    directory: String,
+    filter: Path => Boolean = OrderedFileInputDStream.defaultFilter,
+    newFilesOnly: Boolean = true)
+  extends InputDStream[(K, V)](ssc_) {
+
+  protected[streaming] override val checkpointData = new OrderedFileInputDStreamCheckpointData
+
+  // files found in the last interval
+  private val lastFoundFiles = new HashSet[String]
+
+  // Files with mod time earlier than this are ignored. This is updated every interval
+  // such that in the current interval, files older than any file found in the
+  // previous interval will be ignored. Obviously this time keeps moving forward.
+  private var ignoreTime = if (newFilesOnly) System.currentTimeMillis() else 0L
+
+  // Latest file mod time seen till any point of time
+  @transient private var path_ : Path = null
+  @transient private var fs_ : FileSystem = null
+  @transient private[streaming] var files = new HashMap[Time, Array[String]]
+  @transient private var fileModTimes = new TimeStampedHashMap[String, Long](true)
+  @transient private var lastNewFileFindingTime = 0L
+
+  override def start() { }
+
+  override def stop() { }
+
+  /**
+   * Finds the files that were modified since the last time this method was called and makes
+   * a union RDD out of them. Note that this maintains the list of files that were processed
+   * in the latest modification time in the previous call to this method. This is because the
+   * modification time returned by the FileStatus API seems to return times only at the
+   * granularity of seconds. And new files may have the same modification time as the
+   * latest modification time in the previous call to this method yet not have been reported
+   * in the previous call.
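+   *
+   * For example, two files may both carry the modification time of the
+   * previous batch while only one of them was visible in the earlier
+   * directory listing; remembering the files found last time
+   * (lastFoundFiles) lets the second file be picked up now without
+   * processing the first one twice.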
+   */
+  override def compute(validTime: Time): Option[RDD[(K, V)]] = {
+    assert(validTime.milliseconds >= ignoreTime,
+      "Trying to get new files for a really old time [" + validTime + " < " + ignoreTime + "]")
+
+    // Find new files
+    val (newFiles, minNewFileModTime) = findNewFiles(validTime.milliseconds)
+    logInfo("New files at time " + validTime + ":\n" + newFiles.mkString("\n"))
+    if (!newFiles.isEmpty) {
+      lastFoundFiles.clear()
+      lastFoundFiles ++= newFiles
+      ignoreTime = minNewFileModTime
+    }
+    files += ((validTime, newFiles.toArray))
+    Some(filesToRDD(newFiles))
+  }
+
+  /** Clear the old time-to-files mappings along with old RDDs */
+  protected[streaming] override def clearMetadata(time: Time) {
+    super.clearMetadata(time)
+    val oldFiles = files.filter(_._1 < (time - rememberDuration))
+    files --= oldFiles.keys
+    logInfo("Cleared " + oldFiles.size + " old files that were older than " +
+      (time - rememberDuration) + ": " + oldFiles.keys.mkString(", "))
+    logDebug("Cleared files are:\n" +
+      oldFiles.map(p => (p._1, p._2.mkString(", "))).mkString("\n"))
+    // Delete file mod times that weren't accessed in the last round of getting new files
+    fileModTimes.clearOldValues(lastNewFileFindingTime - 1)
+  }
+
+  /**
+   * Find files which have modification timestamp <= current time and return a
+   * pair of (new files found, minimum modification time among the new files)
+   */
+  private def findNewFiles(currentTime: Long): (Seq[String], Long) = {
+    logDebug("Trying to get new files for time " + currentTime)
+    lastNewFileFindingTime = System.currentTimeMillis
+    // Heuristic: We only consider files whose modification time is more than
+    // five minutes in the past. This eliminates files that have seen updates
+    // in the last five minutes, i.e. (hopefully) those files currently being
+    // appended to.
+    val filter = new CustomPathFilter(currentTime - 5*60*1000)
+    // sort the files from oldest to newest, just to be sure
+    val fileStates = fs.listStatus(directoryPath, filter).sortBy(_.getModificationTime)
+    val newFiles = fileStates.map(_.getPath.toString)
+    val timeTaken = System.currentTimeMillis - lastNewFileFindingTime
+    logInfo("Finding new files took " + timeTaken + " ms")
+    logDebug("# cached file times = " + fileModTimes.size)
+    if (timeTaken > slideDuration.milliseconds) {
+      logWarning(
+        "Time taken to find new files exceeds the batch size. " +
+          "Consider increasing the batch size or reducing the number of " +
+          "files in the monitored directory."
+      )
+    }
+    (newFiles, filter.minNewFileModTime)
+  }
+
+  /** Generate one RDD from an array of files */
+  private def filesToRDD(files: Seq[String]): RDD[(K, V)] = {
+    val fileRDDs = files.map(file => context.sparkContext.newAPIHadoopFile[K, V, F](file))
+    files.zip(fileRDDs).foreach { case (file, rdd) => {
+      if (rdd.partitions.size == 0) {
+        logError("File " + file + " has no data in it. Spark Streaming can only ingest " +
+          "files that have been \"moved\" to the directory assigned to the file stream. 
" + + "Refer to the streaming programming guide for more details.") + } + }} + new UnionRDD(context.sparkContext, fileRDDs) + } + + private def directoryPath: Path = { + if (path_ == null) path_ = new Path(directory) + path_ + } + + private def fs: FileSystem = { + if (fs_ == null) fs_ = directoryPath.getFileSystem(new Configuration()) + fs_ + } + + private def getFileModTime(path: Path) = { + // Get file mod time from cache or fetch it from the file system + fileModTimes.getOrElseUpdate(path.toString, fs.getFileStatus(path).getModificationTime()) + } + + private def reset() { + fs_ = null + } + + @throws(classOf[IOException]) + private def readObject(ois: ObjectInputStream) { + logDebug(this.getClass().getSimpleName + ".readObject used") + ois.defaultReadObject() + generatedRDDs = new HashMap[Time, RDD[(K,V)]] () + files = new HashMap[Time, Array[String]] + fileModTimes = new TimeStampedHashMap[String, Long](true) + } + + /** + * A custom version of the DStreamCheckpointData that stores names of + * Hadoop files as checkpoint data. + */ + private[streaming] + class OrderedFileInputDStreamCheckpointData extends DStreamCheckpointData(this) { + + def hadoopFiles = data.asInstanceOf[HashMap[Time, Array[String]]] + + override def update(time: Time) { + hadoopFiles.clear() + hadoopFiles ++= files + } + + override def cleanup(time: Time) { } + + override def restore() { + hadoopFiles.toSeq.sortBy(_._1)(Time.ordering).foreach { + case (t, f) => { + // Restore the metadata in both files and generatedRDDs + logInfo("Restoring files for time " + t + " - " + + f.mkString("[", ", ", "]") ) + files += ((t, f)) + generatedRDDs += ((t, filesToRDD(f))) + } + } + } + + override def toString() = { + "[\n" + hadoopFiles.size + " file sets\n" + + hadoopFiles.map(p => (p._1, p._2.mkString(", "))).mkString("\n") + "\n]" + } + } + + /** + * Custom PathFilter class to find new files that + * ... have modification time more than ignore time + * ... have not been seen in the last interval + * ... 
have modification time less than maxModTime
+   */
+  private[streaming]
+  class CustomPathFilter(maxModTime: Long) extends PathFilter {
+
+    // Minimum of the mod times of new files found in the current interval
+    var minNewFileModTime = -1L
+
+    def accept(path: Path): Boolean = {
+      try {
+        if (!filter(path)) { // Reject file if it does not satisfy filter
+          logDebug("Rejected by filter " + path)
+          return false
+        }
+        // Reject file if it was found in the last interval
+        if (lastFoundFiles.contains(path.toString)) {
+          logDebug("File " + path + " was already considered in the last interval")
+          return false
+        }
+        val modTime = getFileModTime(path)
+        logDebug("Mod time for " + path + " is " + modTime)
+        if (modTime < ignoreTime) {
+          // Reject file if it was created before the ignore time (or, before the last interval)
+          logDebug("Mod time " + modTime + " less than ignore time " + ignoreTime)
+          return false
+        } else if (modTime > maxModTime) {
+          // Reject file if it is so new that considering it may give errors
+          logDebug("Mod time " + modTime + " more than max mod time " + maxModTime)
+          return false
+        }
+        if (minNewFileModTime < 0 || modTime < minNewFileModTime) {
+          minNewFileModTime = modTime
+        }
+        logDebug("Accepted " + path)
+      } catch {
+        case fnfe: java.io.FileNotFoundException =>
+          logWarning("Error finding new files", fnfe)
+          reset()
+          return false
+      }
+      true
+    }
+  }
+}
+
+private[streaming]
+object OrderedFileInputDStream {
+  def defaultFilter(path: Path): Boolean = !path.getName().startsWith(".")
+}
+
diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/HandleExceptions.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/HandleExceptions.scala
new file mode 100644
index 0000000..0929820
--- /dev/null
+++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/HandleExceptions.scala
@@ -0,0 +1,41 @@
+// Jubatus: Online machine learning framework for distributed environment
+// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation.
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License version 2.1 as published by the Free Software Foundation.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+package us.jubat.jubaql_server.processor
+
+import com.typesafe.scalalogging.slf4j.LazyLogging
+import com.twitter.finagle.{SimpleFilter, Service}
+import io.netty.util.CharsetUtil
+import org.jboss.netty.handler.codec.http._
+import org.jboss.netty.buffer.ChannelBuffers
+import org.json4s.JsonDSL._
+import org.json4s.native.JsonMethods._
+
+class HandleExceptions
+  extends SimpleFilter[HttpRequest, HttpResponse]
+  with LazyLogging {
+  def apply(request: HttpRequest, service: Service[HttpRequest, HttpResponse]) = {
+    // `handle` asynchronously handles exceptions.
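+    // For example (a sketch of the observable behavior): if the wrapped
+    // service throws new RuntimeException("boom"), the client receives a
+    // 500 response whose body is {"result":"boom"}.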
+ service(request) handle { + case error => + logger.error(error.getStackTraceString) + val statusCode = HttpResponseStatus.INTERNAL_SERVER_ERROR + val body = ("result" -> error.getMessage) + val errorResponse = new DefaultHttpResponse(HttpVersion.HTTP_1_1, statusCode) + errorResponse.setContent(ChannelBuffers.copiedBuffer(compact(render(body)), CharsetUtil.UTF_8)) + errorResponse + } + } +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/HybridProcessor.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/HybridProcessor.scala new file mode 100644 index 0000000..a0c86db --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/HybridProcessor.scala @@ -0,0 +1,453 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import scala.concurrent.future +import scala.concurrent.ExecutionContext.Implicits.global +import org.apache.spark.SparkContext +import org.apache.spark.streaming.{Seconds, StreamingContext} +import org.apache.hadoop.io.{Text, LongWritable} +import org.apache.hadoop.mapreduce.lib.input.TextInputFormat +import org.apache.hadoop.fs.Path +import com.typesafe.scalalogging.slf4j.LazyLogging +import scala.util.matching.Regex +import org.apache.spark.streaming.dstream.{OrderedFileInputDStream, ConstantInputDStream, DStream} +import org.apache.spark.rdd.RDD +import java.io.File +import org.apache.spark.streaming.kafka.KafkaUtils +import org.apache.spark.storage.StorageLevel +import org.apache.spark.SparkContext._ +import kafka.serializer.StringDecoder +import scala.collection.mutable.Queue +import org.apache.spark.sql.{SchemaRDD, SQLContext} +import org.apache.spark.sql.catalyst.types.StructType +import org.json4s.JValue +import org.json4s.native.JsonMethods._ + +// "struct" holding the number of processed items, runtime in ms and largest seen id +case class ProcessingInformation(itemCount: Long, runtime: Long, maxId: Option[String]) + +class HybridProcessor(sc: SparkContext, + sqlc: SQLContext, + storageLocation: String, + streamLocations: List[String]) + extends LazyLogging { + /* + * We want to do processing of static data first, then continue with + * stream data. Various approaches are thinkable: + * 1. Create a HybridDStream as a subclass of InputDStream, + * 2. create a HybridReceiver as a subclass of Receiver and turn + * it into a DStream by means of StreamingContext.receiveStream(), + * 3. process static and stream data one after another using two + * different StreamingContexts. + * + * A receiver must implement onStart(), onStop() and write the received data + * to Spark's pipeline from a separate thread using the store() method. 
This + * works nicely with the existing receivers such as KafkaReceiver, but custom + * code is necessary to work with HDFS files and it might be tough to get the + * parallel reading done right. + * + * + * An InputDStream must implement start(), stop(), and compute(time) to generate + * an RDD with data collected in a certain interval. However, there seems to be + * a subtle difference between an InputDStream running on a driver and a + * ReceiverInputDStream that runs a receiver on worker nodes. It seems difficult + * to write one DStream class that gets the parallelism in HDFS and stream + * processing right. + * + * + * + * Therefore we use two different StreamingContexts with one doing processing + * of static data, the other one streaming data. + */ + + require(streamLocations.size <= 1, + "More than one stream location is not supported at the moment.") + + // find the number of workers available to us. + val _runCmd = scala.util.Properties.propOrElse("sun.java.command", "") + val _master = sc.getConf.get("spark.master", "") + val numCoresRe = ".*--executor-cores ([0-9]+) --num-executors ([0-9]+).*".r + val totalNumCores = _runCmd match { + case numCoresRe(coresPerExecutor, numExecutors) => + coresPerExecutor.toInt * numExecutors.toInt + case _ => + 0 + } + if (totalNumCores > 0) + logger.debug("total number of cores: " + totalNumCores) + else + logger.warn("could not extract number of cores from run command: " + _runCmd) + + // define the formats that we can use + val fileRe = """file://(.+)""".r + val hdfsRe = """(hdfs://.+)""".r + val kafkaRe = """kafka://([^/]+)/([^/]+)/([^/]+)$""".r + val dummyRe = """^dummy(.?)""".r + val emptyRe = """^empty(.?)""".r + val validStaticLocations: List[Regex] = emptyRe :: fileRe :: hdfsRe :: Nil + val validStreamLocations: List[Regex] = dummyRe :: kafkaRe :: Nil + + // check if storageLocation matches one of the valid regexes + if (!validStaticLocations.exists(_.findFirstIn(storageLocation).isDefined)) { + throw new IllegalArgumentException(s"'$storageLocation' is not a valid storage " + + "specification") + } + // check if all given streamLocations match one of the valid regexes + val badStreamLocations = streamLocations.filter(loc => + !validStreamLocations.exists(_.findFirstIn(loc).isDefined)) + badStreamLocations collectFirst { + case loc => + throw new IllegalArgumentException(s"'$loc' is not a valid stream specification") + } + + type IdType = String + + // Holds the current streaming context + protected var ssc_ : StreamingContext = null + + def currentStreamingContext() = ssc_ + + // Flag that stores whether static data processing completed successfully + protected var staticProcessingComplete = false + + // Flag that stores whether user stopped data processing manually + protected var userStoppedProcessing = false + + /** + * Start hybrid processing using the given transformation. 
+ * + * @param transform an RDD operation that will be performed on each batch + * @return one function to stop processing and one to get the highest IDs seen so far + */ + def start(transform: RDD[JValue] => RDD[_]): (() => (ProcessingInformation, ProcessingInformation), + () => Option[IdType]) = { + val parseJsonStringIntoOption: (String => Traversable[JValue]) = line => { + val maybeJson = parseOpt(line) + if (maybeJson.isEmpty) { + // logger is not serializable, therefore use println + println("[ERROR] unparseable JSON: " + line) + } + maybeJson + } + val parseAndTransform: RDD[String] => RDD[Unit] = rdd => { + transform(rdd.flatMap(parseJsonStringIntoOption)).map(_ => ()) + } + _start(parseAndTransform) + } + + /** + * Start hybrid processing using the given transformation. + * + * @param transform an RDD operation that will be performed on each batch + * @return one function to stop processing and one to get the highest IDs seen so far + */ + def start(transform: SchemaRDD => SchemaRDD, + schema: Option[StructType]): (() => (ProcessingInformation, ProcessingInformation), + () => Option[IdType]) = { + val parseAndTransform: RDD[String] => RDD[Unit] = rdd => { + // with an empty RDD, we cannot infer the schema (it will raise an exception) + if (rdd.count() > 0) { + // parse with schema or infer if not given + val jsonRdd = schema.map(sqlc.jsonRDD(rdd, _)).getOrElse(sqlc.jsonRDD(rdd, 0.1)) + transform(jsonRdd).map(_ => ()) + } else { + // create an (empty) SchemaRDD + rdd.map(_ => ()) + } + } + _start(parseAndTransform) + } + + /** + * Start hybrid processing using the given transformation. + * + * @param parseAndTransform an RDD operation that will be performed on each batch + * @return one function to stop processing and one to get the highest IDs seen so far + */ + protected def _start(parseAndTransform: RDD[String] => RDD[Unit]): + (() => (ProcessingInformation, ProcessingInformation), () => Option[IdType]) = { + logger.debug("creating StreamingContext for static data") + ssc_ = new StreamingContext(sc, Seconds(2)) + + // this has to match our jubaql_timestamp inserted by fluentd + val extractRe = """.+"jubaql_timestamp": ?"([0-9\-:.T]+)".*""".r + + val extractId: String => IdType = item => { + item match { + case extractRe(idString) => + idString + case _ => + "" + } + } + + // create the static data source + val staticData: DStream[String] = storageLocation match { + /* Notes: + * 1. We have to use fileStream instead of textFileStream because we need + * to pass in newFilesOnly=false. + * 2. The implementation of fileStream will process all existing files + * in the first batch (which will maybe take a very long time). If + * a new file appears during that processing, it will be added to the + * batch of the time when it appeared. It may be worth considering a + * different implementation using a Queue that only enqueues new files + * when all previous processing is done, but we need to closely examine + * the behavior for very long batch processing times before deciding + * on that. + * 3. We have no guarantee about the order of files when using the standard + * FileInputDStream, since it uses o.a.h.f.FileSystem.listStatus() under + * the hood (that is knowledge we should not actually use) and there + * doesn't seem to be any contract about order of files. Our custom + * OrderedFileInputDStream adds that ordering. + * 4. Files that are currently being appended to seem to be read as well + * by the standard FileInputDStream. 
We do *not* want that, since + * such a file would be marked as "processed" and the next file that + * appears would be picked up, even though we did not process all + * its contents. Therefore we use a custom OrderedFileInputDStream + * that ignores files that received updates recently. + */ + case emptyRe(something) => + val queue: Queue[RDD[String]] = new Queue() + ssc_.queueStream(queue) + case fileRe(filepath) => + val realpath = if (filepath.startsWith("/")) { + filepath + } else { + (new File(".")).getAbsolutePath + "/" + filepath + } + new OrderedFileInputDStream[LongWritable, Text, TextInputFormat](ssc_, + "file://" + realpath, + (path: Path) => true, + false).map(_._2.toString) + case hdfsRe(filepath) => + new OrderedFileInputDStream[LongWritable, Text, TextInputFormat](ssc_, + filepath, + (path: Path) => true, + false).map(_._2.toString) + } + logger.debug("static data DStream: " + staticData) + + // keep track of the maximal ID seen during processing + val maxStaticId = sc.accumulator[Option[IdType]](None)(new MaxOptionAccumulatorParam[IdType]) + val countStatic = sc.accumulator(0L) + val maxStreamId = sc.accumulator[Option[IdType]](None)(new MaxOptionAccumulatorParam[IdType]) + val countStream = sc.accumulator(0L) + + // processing of static data + val repartitionedData = if (_master == "yarn-cluster" && totalNumCores > 0) { + // We repartition by (numExecutors * executorCores) to get just the + // right level of parallelism. + logger.info(s"repartitioning for $totalNumCores workers") + staticData.repartition(totalNumCores) + } else { + logger.debug("not repartitioning") + staticData + } + repartitionedData.map(item => { + // update maximal ID + maxStaticId += Some(extractId(item)) + item + }).transform(parseAndTransform).foreachRDD(rdd => { + val count = rdd.count() + // we count the number of total processed rows (on the driver) + countStatic += count + // stop processing of static data if there are no new files + if (count == 0) { + logger.info(s"processed $count (static) lines, looks like done") + synchronized { + staticProcessingComplete = true + } + } else { + logger.info(s"processed $count (static) lines") + } + }) + + // start first StreamingContext + logger.info("starting static data processing") + val staticStartTime = System.currentTimeMillis() + var staticRunTime = 0L + var streamStartTime = -1L + var streamRunTime = 0L + ssc_.start() + val staticStreamingContext = ssc_ + + // start one thread that waits for static data processing to complete + future { + logger.debug("hello from thread to wait for completion of static processing") + // If *either* the static data processing completed successfully, + // *or* the staticStreamingContext finished for some other reason + // (we measure this by the execution time of awaitTermination(timeout)) + // we stop the streaming context. 
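+      // Sketch of the timing trick used below: awaitTermination(200) normally
+      // blocks for roughly 200 ms; if it returns noticeably earlier, the
+      // context must have terminated on its own in the meantime.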
+ val timeToWait = 200L + val logEveryNLoops = 5 + var i = 0 + var staticProcessingStillRunning = true + while (!staticProcessingComplete && staticProcessingStillRunning) { + val timeBeforeWaiting = System.currentTimeMillis() + if (i == logEveryNLoops) { + logger.debug("waiting for static data processing to complete") + i = 0 + } else { + i += 1 + } + staticStreamingContext.awaitTermination(timeToWait) + val timeAfterWaiting = System.currentTimeMillis() + val actuallyWaitedTime = timeAfterWaiting - timeBeforeWaiting + staticProcessingStillRunning = actuallyWaitedTime >= timeToWait + } + if (staticProcessingComplete) { + logger.info("static data processing completed successfully, " + + "stopping StreamingContext") + } else { + logger.warn("static data processing ended, but did not complete") + } + staticStreamingContext.stop(false, true) + logger.debug("bye from thread to wait for completion of static processing") + } onFailure { + case error: Throwable => + logger.error("Error while waiting for static processing end", error) + } + + // start one thread that waits for the first StreamingContext to terminate + future { + // NB. This is a separate thread. In functions that will be serialized, + // you cannot necessarily use variables from outside this thread. + // Also see . + val localExtractId = extractId + val localCountStream = countStream + val localMaxStreamId = maxStreamId + logger.debug("hello from thread to start stream processing") + staticStreamingContext.awaitTermination() + // If we arrive here, the static processing is done, either by failure + // or user termination or because all processing was completed. We want + // to continue with real stream processing only if the static processing + // was completed successfully. + val largestStaticItemId = maxStaticId.value + staticRunTime = System.currentTimeMillis() - staticStartTime + logger.debug("static processing ended after %d items and %s ms, largest seen ID: %s".format( + countStatic.value, staticRunTime, largestStaticItemId)) + if (staticProcessingComplete && !userStoppedProcessing) { + logger.info("static processing completed successfully, setting up stream") + streamLocations match { + case streamLocation :: Nil => + // set up stream processing + logger.debug("creating StreamingContext for stream data") + ssc_ = new StreamingContext(sc, Seconds(2)) + val allStreamData: DStream[String] = streamLocation match { + case dummyRe(nothing) => + // dummy JSON data emitted over and over + val dummyData = sc.parallelize("{\"id\": 5}" :: "{\"id\": 6}" :: + "{\"id\": 7}" :: Nil) + new ConstantInputDStream(ssc_, dummyData) + case kafkaRe(zookeeper, topics, groupId) => + // connect to the given Kafka instance and receive data + val kafkaParams = Map[String, String]( + "zookeeper.connect" -> zookeeper, "group.id" -> groupId, + "auto.offset.reset" -> "smallest") + KafkaUtils.createStream[String, String, + StringDecoder, StringDecoder](ssc_, kafkaParams, + Map(topics -> 2), + // With MEMORY_ONLY, we seem to run out of memory quickly + // when processing is slow. Much worse: There is no space + // left for broadcast variables, so we cannot communicate + // our "runState = false" information. 
+ StorageLevel.DISK_ONLY).map(_._2) + } + val streamData = (largestStaticItemId match { + case Some(largestId) => + // only process items with a strictly larger id than what we + // have seen so far + logger.info("filtering for items with an id larger than " + largestId) + allStreamData.filter(item => { + localExtractId(item) > largestId + }) + case None => + // don't do any ID filtering if there is no "largest id" + logger.info("did not see any items in static processing, " + + "processing whole stream") + allStreamData + }).map(item => { + // remember the largest seen ID + localMaxStreamId += Some(localExtractId(item)) + item + }) + logger.debug("stream data DStream: " + streamData) + streamData.transform(parseAndTransform).foreachRDD(rdd => { + // this `count` is *necessary* to trigger the (lazy) transformation! + val count = rdd.count() + // we count the number of total processed rows (on the driver) + localCountStream += count + logger.info(s"processed $count (stream) lines") + }) + // start stream processing + synchronized { + if (userStoppedProcessing) { + logger.info("processing was stopped by user during stream setup, " + + "not starting") + } else { + logger.info("starting stream processing") + streamStartTime = System.currentTimeMillis() + ssc_.start() + } + } + case Nil => + logger.info("not starting stream processing " + + "(no stream source given)") + case _ => + logger.error("not starting stream processing " + + "(multiple streams not implemented)") + } + } else if (staticProcessingComplete && userStoppedProcessing) { + logger.info("static processing was stopped by user, " + + "not setting up stream") + } else { + logger.warn("static processing did not complete successfully, " + + "not setting up stream") + } + logger.debug("bye from thread to start stream processing") + } onFailure { + case error: Throwable => + logger.error("Error while setting up stream processing", error) + } + + // return a function to stop the data processing + (() => { + logger.info("got shutdown request from user") + synchronized { + userStoppedProcessing = true + } + logger.debug("now stopping the StreamingContext") + currentStreamingContext.stop(false, true) + logger.debug("done stopping the StreamingContext") + // if stream processing was not started or there was a runtime already + // computed, we don't update the runtime + if (streamStartTime > 0 && streamRunTime == 0) { + streamRunTime = System.currentTimeMillis() - streamStartTime + } + logger.info(("processed %s items in %s ms (static) and %s items in " + + "%s ms (stream)").format(countStatic.value, staticRunTime, + countStream.value, streamRunTime)) + (ProcessingInformation(countStatic.value, staticRunTime, maxStaticId.value), + ProcessingInformation(countStream.value, streamRunTime, maxStreamId.value)) + }, () => maxStaticId.value) + } + + + def awaitTermination() = { + ssc_.awaitTermination() + } +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLAST.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLAST.scala new file mode 100644 index 0000000..eb36315 --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLAST.scala @@ -0,0 +1,47 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. 
+// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + + +sealed abstract trait JubaQLAST + +case class +CreateDatasource(sourceName: String, + columns: List[(String, String)], + sinkStorage: String, + sinkStreams: List[String]) extends JubaQLAST + +case class +CreateModel(algorithm: String, + modelName: String, + configJson: String, + specifier: List[(String, List[String])]) extends JubaQLAST { + override def toString: String = "CreateModel(%s,%s,%s,%s)".format( + algorithm, + modelName, + if (configJson.size > 13) configJson.take(5) + "..." + configJson.takeRight(5) + else configJson, + specifier + ) +} + +case class Update(modelName: String, rpcName: String, source: String) extends JubaQLAST + +case class Analyze(modelName: String, rpcName: String, data: String) extends JubaQLAST + +case class Shutdown() extends JubaQLAST + +case class StopProcessing() extends JubaQLAST diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLParser.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLParser.scala new file mode 100644 index 0000000..64c9a5d --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLParser.scala @@ -0,0 +1,176 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+package us.jubat.jubaql_server.processor
+
+import org.apache.spark.sql.catalyst.SqlParser
+import org.apache.spark.sql.catalyst.plans.logical._
+import com.typesafe.scalalogging.slf4j.LazyLogging
+
+class JubaQLParser extends SqlParser with LazyLogging {
+
+  protected lazy val CREATE = Keyword("CREATE")
+  protected lazy val DATASOURCE = Keyword("DATASOURCE")
+  protected lazy val MODEL = Keyword("MODEL")
+  protected lazy val ANOMALY = Keyword("ANOMALY")
+  protected lazy val CLASSIFIER = Keyword("CLASSIFIER")
+  protected lazy val RECOMMENDER = Keyword("RECOMMENDER")
+  protected lazy val WITH = Keyword("WITH")
+  protected lazy val UPDATE = Keyword("UPDATE")
+  protected lazy val ANALYZE = Keyword("ANALYZE")
+  protected lazy val USING = Keyword("USING")
+  protected lazy val DATA = Keyword("DATA")
+  protected lazy val STORAGE = Keyword("STORAGE")
+  protected lazy val STREAM = Keyword("STREAM")
+  protected lazy val config = Keyword("config")
+  protected lazy val numeric = Keyword("numeric")
+  protected lazy val string = Keyword("string")
+  protected lazy val boolean = Keyword("boolean")
+  protected lazy val SHUTDOWN = Keyword("SHUTDOWN")
+  protected lazy val STOP = Keyword("STOP")
+  protected lazy val PROCESSING = Keyword("PROCESSING")
+
+  // column_name column_type
+  protected lazy val stringPairs: Parser[(String, String)] = {
+    ident ~ (numeric | string | boolean) ^^ {
+      case x ~ y => (x, y)
+    }
+  }
+
+  protected lazy val stream: Parser[String] = {
+    STREAM ~ ":" ~> stringLit ^^ {
+      case url => url
+    }
+  }
+  protected lazy val streamList: Parser[List[String]] = {
+    "," ~> rep1sep(stream, ",") ^^ {
+      case rep => rep
+    }
+  }
+
+  // CREATE DATASOURCE source_name (column_name data_type, [...])
+  //   FROM (STORAGE: "storage_url" [, STREAM: "stream_url", ...])
+  protected lazy val createDatasource: Parser[JubaQLAST] = {
+    CREATE ~ DATASOURCE ~> ident ~ opt("(" ~ rep1sep(stringPairs, ",") ~ ")") ~
+      FROM ~ "(" ~ STORAGE ~ ":" ~ stringLit ~ opt(streamList) <~ ")" ^^ {
+      case sourceName ~ rep ~ _ /*FROM*/ ~ _ ~ _ /*STORAGE*/ ~ _ ~ storage ~ streams =>
+        rep match {
+          case Some(r) =>
+            CreateDatasource(sourceName, r._1._2, storage, streams.getOrElse(List[String]()))
+          case None =>
+            CreateDatasource(sourceName, List(), storage, streams.getOrElse(List[String]()))
+        }
+    }
+  }
+
+  protected lazy val jubatusAlgorithm: Parser[String] = {
+    (ANOMALY | CLASSIFIER | RECOMMENDER) ^^ {
+      case x => x
+    }
+  }
+
+  protected lazy val createWith: Parser[(String, List[String])] = {
+    ident ~ ":" ~ stringLit ^^ {
+      case key ~ _ ~ value =>
+        (key, List(value))
+    } |
+      ident ~ ":" ~ "[" ~ rep1sep(stringLit, ",") <~ "]" ^^ {
+        case key ~ _ ~ _ ~ values =>
+          (key, values)
+      }
+  }
+
+  // CREATE algorithm_name MODEL model_name WITH (...) config = "json string"
+  protected lazy val createModel: Parser[JubaQLAST] = {
+    CREATE ~> jubatusAlgorithm ~ MODEL ~ ident ~ WITH ~ "(" ~ opt(rep1sep(createWith, ",")) ~ ")" ~ "config" ~ "=" ~ stringLit ^^ {
+      case algorithm ~ _ ~ modelName ~ _ /*with*/ ~ _ ~ cwith ~ _ ~ _ /*config*/ ~ _ ~ config =>
+        CreateModel(algorithm, modelName, config, cwith.getOrElse(List[(String, List[String])]()))
+    }
+  }
+
+  // This select is copied from SqlParser, with the `from` clause removed.
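+  // A FROM-less query like "SELECT 1 + 1" should thus parse into a
+  // Project over NoRelation (illustrative example, not exercised here).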
+ protected lazy val jubaqlSelect: Parser[LogicalPlan] = + SELECT ~> opt(DISTINCT) ~ projections ~ + opt(filter) ~ + opt(grouping) ~ + opt(having) ~ + opt(orderBy) ~ + opt(limit) <~ opt(";") ^^ { + case d ~ p ~ f ~ g ~ h ~ o ~ l => + val base = NoRelation + val withFilter = f.map(f => Filter(f, base)).getOrElse(base) + val withProjection = + g.map { + g => + Aggregate(g, assignAliases(p), withFilter) + }.getOrElse(Project(assignAliases(p), withFilter)) + val withDistinct = d.map(_ => Distinct(withProjection)).getOrElse(withProjection) + val withHaving = h.map(h => Filter(h, withDistinct)).getOrElse(withDistinct) + val withOrder = o.map(o => Sort(o, withHaving)).getOrElse(withHaving) + val withLimit = l.map { + l => Limit(l, withOrder) + }.getOrElse(withOrder) + withLimit + } + + protected lazy val update: Parser[JubaQLAST] = { + UPDATE ~ MODEL ~> ident ~ USING ~ ident ~ FROM ~ ident ^^ { + case modelName ~ _ ~ rpcName ~ _ ~ source => + Update(modelName, rpcName, source) + } + } + + protected lazy val analyze: Parser[JubaQLAST] = { + ANALYZE ~> stringLit ~ BY ~ MODEL ~ ident ~ USING ~ ident ^^ { + case data ~ _ ~ _ ~ modelName ~ _ ~ rpc => + Analyze(modelName, rpc, data) + } + } + + protected lazy val shutdown: Parser[JubaQLAST] = { + SHUTDOWN ^^ { + case _ => + Shutdown() + } + } + + protected lazy val stopProcessing: Parser[JubaQLAST] = { + STOP ~> PROCESSING ^^ { + case _ => + StopProcessing() + } + } + + protected lazy val jubaQLQuery: Parser[JubaQLAST] = { + createDatasource | + createModel | + update | + analyze | + shutdown | + stopProcessing + } + + // note: apply cannot override incompatible type with parent class + //override def apply(input: String): Option[JubaQLAST] = { + def parse(input: String): Option[JubaQLAST] = { + phrase(jubaQLQuery)(new lexical.Scanner(input)) match { + case Success(r, q) => + logger.debug(s"successfully parsed '$input' into $r") + Option(r) + case x => + logger.warn(s"failed to parse '$input' as JubaQL") + None + } + } +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLProcessor.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLProcessor.scala new file mode 100644 index 0000000..67a5c16 --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLProcessor.scala @@ -0,0 +1,186 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+package us.jubat.jubaql_server.processor
+
+import scala.sys.process._
+import java.net.InetAddress
+import com.typesafe.scalalogging.slf4j.LazyLogging
+import org.apache.spark.SparkContext
+import com.twitter.finagle.{Http, Service}
+import com.twitter.util.Await
+import org.jboss.netty.handler.codec.http._
+import sun.misc.{SignalHandler, Signal}
+
+object JubaQLProcessor extends LazyLogging {
+  protected var portHolder: Option[Int] = None
+
+  /** Main function to start the JubaQL processor application. */
+  def main(args: Array[String]) {
+    // check run.mode property (default: development mode).
+    val runModeString = scala.util.Properties.propOrElse("run.mode", "")
+    val runMode: RunMode = runModeString match {
+      case "" | "development" =>
+        RunMode.Development
+      case "production" =>
+        // Require that zookeeper is given in production mode in
+        // the form "host:port,host:port,...". If a port is not given, assume 2181.
+        val zookeeperString = scala.util.Properties.propOrElse("jubaql.zookeeper", "")
+        if (zookeeperString.trim.isEmpty) {
+          logger.error("system property jubaql.zookeeper must be given " +
+            "in production mode (comma-separated host:port list)")
+          System.exit(1)
+        }
+        val hostPortRe = "^([a-zA-Z0-9.]*[a-zA-Z0-9])(:[0-9]+)?$".r
+        val hosts: Seq[(String, Int)] = zookeeperString.split(',').map(_ match {
+          case hostPortRe(host, portWithColon) =>
+            if (portWithColon == null)
+              (host, 2181)
+            else
+              (host, portWithColon.stripPrefix(":").toInt)
+          case x =>
+            logger.error(s"'$zookeeperString' is not a valid jubaql.zookeeper string")
+            System.exit(1)
+            null
+        })
+        RunMode.Production(hosts.toList)
+      case other =>
+        logger.error(s"bad run.mode property: $other")
+        System.exit(1)
+        RunMode.Development // for the type of the match expression
+    }
+    logger.debug(s"Starting JubaQLProcessor in run mode $runMode")
+
+    // When run through spark-submit, the Java system property "spark.master"
+    // will contain the master passed to spark-submit and we *must* use the
+    // same; otherwise use "local[3]".
+    val master = scala.util.Properties.propOrElse("spark.master", "local[3]")
+
+    // start Spark
+    logger.info("JubaQLProcessor Spark backend starting")
+    val sc = new SparkContext(master, "JubaQL Processor")
+
+    // start HTTP interface
+    val service: Service[HttpRequest, HttpResponse] = new JubaQLService(sc, runMode)
+    val errorHandler = new HandleExceptions
+    logger.info("JubaQLProcessor HTTP server starting")
+    val server = Http.serve(":*", errorHandler andThen service)
+    var isRegistered = false
+
+    val address = server.boundAddress
+    if (!address.isInstanceOf[java.net.InetSocketAddress]) {
+      logger.error("current implementation of the finagle server does not provide boundAddress as InetSocketAddress")
+      System.exit(1)
+    }
+    val inetAddress = address.asInstanceOf[java.net.InetSocketAddress]
+    val port = inetAddress.getPort
+    portHolder = Some(port)
+    logger.info(s"HTTP server listening on port $port")
+
+    // Create a helper to do (un)registration with the gateway, if a URL is given.
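+    // Sketch: the registration URL arrives as the first command-line argument,
+    // e.g. "spark-submit ... processor.jar http://gateway-host:9877/registration"
+    // (the URL shape here is illustrative only). Without an argument, the
+    // processor simply runs without registering anywhere.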
+ val regHandler = args.headOption.map(new RegistrationHandler(_)) + + val shutDownNicely = new SignalHandler() { + def handle(sig: Signal) { + logger.info("received signal, shutting down") + // unregister before shutting down the server so that we will not + // receive any queries from the gateway after server was stopped + if (isRegistered) { + unregister(regHandler) + isRegistered = false + } + Await.result(server.close()) + } + } + + // Ctrl+C + Signal.handle(new Signal("INT"), shutDownNicely) + // kill + Signal.handle(new Signal("TERM"), shutDownNicely) + + // register -- will exit(1) on failure! + isRegistered = register(regHandler) + + // main loop + Await.ready(server) + logger.info("JubaQLProcessor HTTP server stopped") + sc.stop() + logger.info("JubaQLProcessor Spark backend stopped") + + // If server was stopped by a signal, unregistration was already done, so + // isRegistered is false. Otherwise (which case is that??), unregister now. + if (isRegistered) + unregister(regHandler) + + logger.info("JubaQLProcessor shut down successfully") + // If (and only if) we used dispatch to (un)register with an HTTP server, the + // program will not exit here because there are still threads running. + // Therefore we add a manual system exit as the last line. + val THREAD_DEBUG = false + if (THREAD_DEBUG) { + val threadDesc = Thread.getAllStackTraces().keySet().toArray().map(tObj => { + val t = tObj.asInstanceOf[Thread] + t.toString + " (daemon: " + t.isDaemon + ", " + t.getState + ")" + }) + logger.debug("threads still running:\n" + threadDesc.mkString("\n")) + } + System.exit(0) + } + + def getListeningAddress: (InetAddress, Int) = { + // InetAddress.getLocalHost will probably/maybe lead to unforeseeable + // behavior with multiple interfaces. Instead we use the IP address + // that our own hostname resolves into. (This is a requirement also + // imposed by CDH, so we can assume it is a reasonable choice.) 
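+    // For example, if `hostname` prints "worker-3" and that name resolves to
+    // 10.0.0.5, we advertise 10.0.0.5 (illustrative values).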
+ val host = InetAddress.getByName("hostname".!!.trim) + portHolder match { + case Some(port) => + (host, port) + case _ => + throw new UninitializedError() + } + } + + protected def register(regHandler: Option[RegistrationHandler]): Boolean = { + regHandler.map(r => { + logger.info("registering with JubaQLGateway at " + r.registerUrl) + r.register match { + case Left(exception) => + exception match { + case e: IllegalArgumentException => + logger.error("invalid URL provided: " + e.getMessage) + case e: Throwable => + logger.error("registration failed: " + e.toString) + } + System.exit(1) + case Right(_) => + logger.info("registered successfully") + } + true // we will only come here if registration was successful + }).getOrElse(false) + } + + protected def unregister(regHandler: Option[RegistrationHandler]) = { + regHandler.foreach(r => { + logger.info("unregistering from JubaQLGateway at " + r.registerUrl) + r.unregister match { + case Left(exception) => + logger.error("unregistration failed: " + exception.getMessage) + case Right(_) => + logger.info("unregistered successfully") + } + }) + } +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLService.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLService.scala new file mode 100644 index 0000000..184ab5f --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLService.scala @@ -0,0 +1,626 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import java.net.InetAddress +import java.util.concurrent.ConcurrentHashMap + +import com.twitter.finagle.Service +import com.twitter.util.{Future => TwFuture, Promise => TwPromise} +import com.typesafe.scalalogging.slf4j.LazyLogging +import io.netty.util.CharsetUtil +import us.jubat.jubaql_server.processor.json.{AnomalyScore, ClassifierPrediction, ClassifierResult, DatumResult} +import us.jubat.jubaql_server.processor.updater.{Anomaly, Classifier, Recommender} +import org.apache.spark.SparkContext +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.catalyst.types._ +import org.jboss.netty.buffer.ChannelBuffers +import org.jboss.netty.handler.codec.http._ +import org.json4s._ +import org.json4s.native.{JsonMethods, Serialization} +import us.jubat.anomaly.AnomalyClient +import us.jubat.classifier.ClassifierClient +import us.jubat.common.Datum +import us.jubat.recommender.RecommenderClient +import us.jubat.yarn.client.{JubatusYarnApplicationStatus, JubatusYarnApplication, Resource} +import us.jubat.yarn.common.{LearningMachineType, Location} + +import scala.collection._ +import scala.collection.convert.decorateAsScala._ +import scala.concurrent.{Future => ScFuture, Promise => ScPromise, Await => ScAwait, SyncVar} +import scala.concurrent.duration._ +import scala.concurrent.ExecutionContext.Implicits.global +import scala.util.{Failure, Random, Success, Try} +import sun.misc.Signal + +class JubaQLService(sc: SparkContext, runMode: RunMode) + extends Service[HttpRequest, HttpResponse] + with LazyLogging { + val random = new Random() + val parser = new JubaQLParser() + // alias name for parser is needed to override SQLContext's parser + val parserAlias = parser + val sqlc = new SQLContext(sc) { + override val parser = parserAlias + } + val sources: concurrent.Map[String, (HybridProcessor, StructType)] = + new ConcurrentHashMap[String, (HybridProcessor, StructType)]().asScala + val models: concurrent.Map[String, (JubatusYarnApplication, CreateModel, LearningMachineType)] = + new ConcurrentHashMap[String, (JubatusYarnApplication, CreateModel, LearningMachineType)]().asScala + val startedJubatusInstances: concurrent.Map[String, ScFuture[JubatusYarnApplication]] = + new ConcurrentHashMap[String, ScFuture[JubatusYarnApplication]]().asScala + + // set this flag to `false` to prevent the HTTP server from processing queries + protected val isAcceptingQueries: SyncVar[Boolean] = new SyncVar() + isAcceptingQueries.put(true) + + // set this flag to `true` to signal to executors they should stop processing + protected val executorsShouldFinishProcessing: SyncVar[Boolean] = new SyncVar() + executorsShouldFinishProcessing.put(false) + + // store a function to stop the UPDATE process (if one is running) + protected var stopUpdateFunc: Option[() => (ProcessingInformation, ProcessingInformation)] = None + + /** Sets up processing for an incoming request; returns Future of the result. + */ + override def apply(request: HttpRequest): TwFuture[HttpResponse] = { + /* + * If no processing is required, a Future.value() can be used to + * immediately return a value. 
Otherwise, create a Future with + * the value to be processed (i.e., a Promise), create a processing + * pipeline using map() and flatMap() and return a Future of the + * response. Processing will then happen in a thread pool and newly + * arriving requests will be enqueued. + */ + val requestId = "req#" + Math.abs(random.nextInt) + logger.info("[%s] received %s request to %s".format(requestId, + request.getMethod.getName, request.getUri)) + + request.getUri match { + case "/status" => + val resp = new DefaultHttpResponse(HttpVersion.HTTP_1_1, + HttpResponseStatus.OK) + if (executorsShouldFinishProcessing.get == true) + resp.setContent(ChannelBuffers.copiedBuffer("shutdown", CharsetUtil.UTF_8)) + else + resp.setContent(ChannelBuffers.copiedBuffer("running", CharsetUtil.UTF_8)) + TwFuture.value(resp) + + // if we get POSTed a statement, process it + case "/jubaql" if request.getMethod == HttpMethod.POST => + val body = request.getContent.toString(CharsetUtil.UTF_8) + logger.debug("[%s] request body: %s".format(requestId, body)) + + // create an empty promise and create the processing pipeline + val command = new TwPromise[String] + // TODO: use Either or Future semantics to transport success/failure information + val result: TwFuture[Option[String]] = command.map(parseJson).map(_.flatMap(takeAction)) + // now actually put the received command in the promise, + // triggering the processing + command.setValue(body) + + // create an HttpResponse based on the result + val responseFuture = result.map(res => { + // pick HTTP response code and body + val (resp, bodyJson) = res match { + case Some(msg) => + (new DefaultHttpResponse(HttpVersion.HTTP_1_1, + HttpResponseStatus.OK), + // msg may already be a JSON string + // TODO: get this type-safe + if (msg.startsWith("{") || msg.startsWith("[")) + "{\"result\": %s}".format(msg) + else + "{\"result\": \"%s\"}".format(msg)) + case _ => + (new DefaultHttpResponse(HttpVersion.HTTP_1_1, + HttpResponseStatus.INTERNAL_SERVER_ERROR), + "{\"result\": \"error\"}") + } + // add header and body + resp.addHeader("Content-Type", "application/json; charset=utf-8") + resp.setContent(ChannelBuffers.copiedBuffer(bodyJson, CharsetUtil.UTF_8)) + logger.info("[%s] request processing complete => %s".format(requestId, + resp.getStatus.getCode)) + resp + }) + logger.debug("[%s] request processing prepared".format(requestId)) + responseFuture + + // return 404 in any other case + case _ => + logger.info("[%s] => 404".format(requestId)) + val response = new DefaultHttpResponse(HttpVersion.HTTP_1_1, + HttpResponseStatus.NOT_FOUND) + TwFuture.value(response) + } + } + + protected def parseJson(in: String): Option[JubaQLAST] = { + // parse string and extract the "query" field + JsonMethods.parseOpt(in).map(_ \ "query") match { + case Some(JString(queryString)) => + try { + parser.parse(queryString) + } catch { + case e: Throwable => + logger.error(s"unable to parse queryString '$queryString': " + e.getMessage) + None + } + case Some(other) => + logger.warn(s"received JSON '$in' did not contain a query string") + None + case None => + logger.warn(s"received string '$in' was not valid JSON") + None + } + } + + protected def takeAction(ast: JubaQLAST): Option[String] = { + ast match { + case anything if isAcceptingQueries.get == false => + logger.warn(s"received $anything while shutting down, not taking action") + // propagate message to client + None + + case cd: CreateDatasource => + val processor = new HybridProcessor(sc, sqlc, cd.sinkStorage, cd.sinkStreams) + // TODO schema 
must be optional + val schema = StructType(cd.columns.map { + case (colName, dataType) => { + StructField(colName, dataType.toLowerCase match { + case "numeric" => LongType + case "string" => StringType + case "boolean" => BooleanType + case _ => ??? + }, false) + } + }) + sources.put(cd.sourceName, (processor, schema)) + Some("CREATE DATASOURCE") + + case cm: CreateModel => + val jubaType: LearningMachineType = cm.algorithm match { + case "CLASSIFIER" => + LearningMachineType.Classifier + case "ANOMALY" => + LearningMachineType.Anomaly + case "RECOMMENDER" => + LearningMachineType.Recommender + } + // TODO: location, resource + val resource = Resource(priority = 0, memory = 256, virtualCores = 1) + val juba: ScFuture[JubatusYarnApplication] = runMode match { + case RunMode.Production(zookeeper) => + val location = zookeeper.map { + case (host, port) => Location(InetAddress.getByName(host), port) + } + JubatusYarnApplication.start(cm.modelName, jubaType, location, cm.configJson, resource, 2) + case RunMode.Development => + LocalJubatusApplication.start(cm.modelName, jubaType, cm.configJson) + } + + // we keep a reference to the started instance so we can always check its status + // and wait for it to come up if necessary + val startedInstance = ScPromise[JubatusYarnApplication]() + startedJubatusInstances.put(cm.modelName, startedInstance.future) + juba onComplete { + case Success(j) => + logger.info("CREATE MODEL succeeded") + models.put(cm.modelName, (j, cm, jubaType)) + startedInstance.completeWith(juba) + case Failure(t) => + logger.info("CREATE MODEL failed") + t.printStackTrace() + startedInstance.completeWith(juba) + } + Some("CREATE MODEL (started)") + + case update: Update => + var model: JubatusYarnApplication = null + var jubaType: LearningMachineType = null + var cm: CreateModel = null + // wait until model is available (when Jubatus is started) or timeout + startedJubatusInstances.get(update.modelName).foreach(jubaFut => { + if (!jubaFut.isCompleted) { + logger.debug("waiting for model %s to come up".format(update.modelName)) + ScAwait.ready(jubaFut, 1 minute) + } + }) + val maybeModel = models.get(update.modelName) + maybeModel match { + case Some((s, c, ty)) => (s, c, ty) + model = s + cm = c + jubaType = ty + case None => + // TODO: error message + logger.error("model not found") + return None + } + + // Note: Theoretically it would as well be possible to address the jubatus + // instances directly by looking at `model.jubatusServers`. + val jubaHost = model.jubatusProxy.hostAddress + val jubaPort = model.jubatusProxy.port + val trainSpecifier = cm.specifier.toMap + val keys = trainSpecifier.get("datum") match { + case Some(list) if list.nonEmpty => list + case _ => ??? // TODO: throw exception. datum not specified + } + + val updater = jubaType match { + case LearningMachineType.Anomaly if update.rpcName == "add" => + new Anomaly(jubaHost, jubaPort, cm, keys) + + case LearningMachineType.Classifier if update.rpcName == "train" => + val label = trainSpecifier.get("label") match { + case Some(la :: Nil) => la + case _ => ??? // TODO: throw exception + } + new Classifier(jubaHost, jubaPort, cm, keys) + + case LearningMachineType.Recommender if update.rpcName == "update_row" => + val id = trainSpecifier.get("id") match { + case Some(id :: Nil) => id + case _ => ??? 
// TODO: throw exception + } + new Recommender(jubaHost, jubaPort, cm, id, keys) + + case lmt => + logger.error("no matching learning machine for " + lmt) + return None + } + + // Start to process RDD + try sources.get(update.source) match { + case Some((rddProcessor, schema)) => + logger.info("UPDATE started") + val (host, port) = JubaQLProcessor.getListeningAddress + val statusUrl = "http://%s:%s/status".format(host.getHostAddress, port) + val stopFun = rddProcessor.start(rddjson => { + rddjson.mapPartitions(updater(_, statusUrl)) + })._1 + // store the function to stop processing + stopUpdateFunc = Some(() => stopFun()) + Some("UPDATE MODEL") + + case None => + // TODO: error message + logger.error("source '%s' not found".format(update.source)) + None + } + + case ana: Analyze => + queryAnalyze(ana) match { + case Some(toReturn) => + Some(toReturn) + case None => + logger.error("no ANALYZE result for " + ana) + None + } + + case s: Shutdown => + // first set a flag to stop further query processing + isAcceptingQueries.set(false) // NB. put() has different semantics + // stop stream processing + val procStats = stopUpdateFunc match { + case Some(func) => + Some(stopStreamProcessing(func)) + case _ => + logger.info("apparently there was no stream processing running") + None + } + // Shut down all Jubatus instances. First, loop over all Jubatus instances + // ever started, independent of complete (successful or failed) or still + // starting: + val stoppedJubaFut: Iterable[ScFuture[Unit]] = startedJubatusInstances.map { + case (modelName, jubaFut) => + logger.debug(s"scheduling shutdown for model $modelName") + // If the startup failed, no need to shutdown. For all non-failed + // instances (still starting or started successfully), we schedule + // a shutdown using map(). + jubaFut.map(juba => shutdownJubatus(modelName, juba)) + } + // now convert a list of futures into a future of list and wait until completion + logger.info("waiting for all Jubatus instances to shut down") + ScAwait.ready(ScFuture.sequence(stoppedJubaFut), 1 minute) + // send a KILL signal to us to trigger Spark and Finagle shutdown + Signal.raise(new Signal("TERM")) + procStats match { + case Some((staticInfo, streamInfo)) => + Some("SHUTDOWN (processing time: %s ms/%s ms)".format( + staticInfo.runtime, streamInfo.runtime)) + case _ => + Some("SHUTDOWN") + } + + case sp: StopProcessing => + stopUpdateFunc match { + case Some(func) => + val (staticInfo, streamInfo) = stopStreamProcessing(func) + stopUpdateFunc = None + Some("STOP PROCESSING (processing time: %s ms/%s ms)".format( + staticInfo.runtime, streamInfo.runtime)) + case _ => + logger.warn("apparently there was no stream processing running") + None + } + + case other => + logger.error("no handler for " + other) + None + } + } + + protected def stopStreamProcessing(stopFun: () => (ProcessingInformation, ProcessingInformation)): + (ProcessingInformation, ProcessingInformation) = { + logger.info("stopping stream processing") + // tell executors they should stop their processing + executorsShouldFinishProcessing.set(true) // NB. 
put() has different semantics
+    // the following call will block until processing is done completely
+    val (staticInfo, streamInfo) = stopFun()
+    logger.info("shut down successfully; processed %s/%s items".format(
+      staticInfo.itemCount, streamInfo.itemCount
+    ))
+    (staticInfo, streamInfo)
+  }
+
+  protected def shutdownJubatus(modelName: String, app: JubatusYarnApplication) = {
+    logger.info(s"shutting down model: $modelName")
+    try {
+      app.stop()
+      logger.info(s"model $modelName shut down successfully")
+    } catch {
+      case e: Throwable =>
+        logger.error(s"failed to shut down $modelName: " + e.getMessage)
+    }
+  }
+
+  protected def extractDatum(keys: List[String], data: String): Datum = {
+    extractDatum(keys, JsonMethods.parse(data))
+  }
+
+  protected def extractDatum(keys: List[String], jvalue: JValue): Datum = {
+    // filter out unused fields
+    val filtered = jvalue.filterField {
+      case JField(key, _) => keys.indexOf(key) >= 0
+      case _ => false
+    }
+
+    val datum = new Datum
+    filtered.foreach { j =>
+      val key = j._1
+      j._2 match {
+        case JInt(v) =>
+          datum.addNumber(key, v.toDouble)
+        case JDouble(v) =>
+          datum.addNumber(key, v)
+        case JString(v) =>
+          datum.addString(key, v)
+        case _ =>
+      }
+    }
+    datum
+  }
+
+  protected def queryAnalyze(ana: Analyze): Option[String] = {
+    def datumToJson(datum: Datum): DatumResult = {
+      DatumResult(
+        datum.getStringValues().asScala.map(v => (v.key, v.value)).toMap,
+        datum.getNumValues().asScala.map(v => (v.key, v.value)).toMap
+      )
+    }
+    models.get(ana.modelName) match {
+      case Some((s, cm, LearningMachineType.Anomaly)) if ana.rpcName == "calc_score" =>
+        val host = s.jubatusProxy.hostAddress
+        val port = s.jubatusProxy.port
+        val keys = cm.specifier.toMap.get("datum") match {
+          case Some(list) if list.nonEmpty => list
+          case _ => ??? // TODO: throw exception. datum not specified
+        }
+        var datum = extractDatum(keys, ana.data)
+        val anomaly = new AnomalyClient(host, port, ana.modelName, 5)
+        try {
+          val score = AnomalyScore(anomaly.calcScore(datum))
+          implicit val formats = DefaultFormats
+          return Some(Serialization.write(score))
+        } finally {
+          anomaly.getClient.close()
+        }
+
+      case Some((s, cm, LearningMachineType.Classifier)) if ana.rpcName == "classify" =>
+        val host = s.jubatusProxy.hostAddress
+        val port = s.jubatusProxy.port
+        val keys = cm.specifier.toMap.get("datum") match {
+          case Some(list) if list.nonEmpty => list
+          case _ => ??? // TODO: throw exception. 
datum not specified + } + var datum = extractDatum(keys, ana.data) + val data = new java.util.LinkedList[Datum]() + data.add(datum) + val classifier = new ClassifierClient(host, port, ana.modelName, 5) + try { + val res = classifier.classify(data) + if (res.size() >= 1) { + // return in json format + val retValue = ClassifierResult(res.get(0).asScala.map({ + f => ClassifierPrediction(f.label, f.score) + }).toList) + implicit val formats = DefaultFormats + return Some(Serialization.write(retValue)) + } else { + // TODO: return error in json + } + } finally { + classifier.getClient().close() + } + case Some((s, cm, LearningMachineType.Recommender)) if (ana.rpcName == "complete_row_from_id" || + ana.rpcName == "complete_row_from_datum") => + val host = s.jubatusProxy.hostAddress + val port = s.jubatusProxy.port + ana.rpcName match { + case "complete_row_from_id" => + val recommender = new RecommenderClient(host, port, ana.modelName, 5) + try { + val retDatum = datumToJson(recommender.completeRowFromId(ana.data)) + + implicit val formats = DefaultFormats + return Some(Serialization.write(retDatum)) + } finally { + recommender.getClient().close() + } + + case "complete_row_from_datum" => + val keys = cm.specifier.toMap.get("datum") match { + case Some(list) if list.nonEmpty => list + case _ => ??? // TODO: throw exception. datum not specified + } + var datum = extractDatum(keys, ana.data) + val recommender = new RecommenderClient(host, port, ana.modelName, 5) + + try { + val retDatum = datumToJson(recommender.completeRowFromDatum(datum)) + + implicit val formats = DefaultFormats + return Some(Serialization.write(retDatum)) + } finally { + recommender.getClient().close() + } + case _ => + } + case _ => + // error + None + } + None + } +} + +sealed trait RunMode + +object RunMode { + case class Production(zookeeper: List[(String, Int)]) extends RunMode + case object Development extends RunMode +} + +object LocalJubatusApplication extends LazyLogging { + def start(aLearningMachineName: String, + aLearningMachineType: LearningMachineType, + aConfigString: String): scala.concurrent.Future[us.jubat.yarn.client.JubatusYarnApplication] = { + scala.concurrent.Future { + val jubaCmdName = aLearningMachineType match { + case LearningMachineType.Anomaly => + "jubaanomaly" + case LearningMachineType.Classifier => + "jubaclassifier" + case LearningMachineType.Recommender => + "jubarecommender" + } + + logger.info(s"start LocalJubatusApplication (name: $aLearningMachineName, $jubaCmdName)") + val namedPipePath = s"/tmp/${aLearningMachineName.trim}" + val runtime = Runtime.getRuntime + try { + val exitStatus = mkfifo(namedPipePath, runtime) + if (exitStatus != 0) { + logger.error(f"failed to create a named pipe at $namedPipePath%s with exit status $exitStatus") + } + } catch { + case e: java.io.IOException => + logger.error(s"failed to create a named pipe at $namedPipePath") + System.exit(1) + } + + val namedPipe = new java.io.File(namedPipePath) + try { + val jubatusProcess = runtime.exec(s"$jubaCmdName -f $namedPipePath") + handleSubProcessOutput(jubatusProcess.getInputStream, System.out, jubaCmdName) + handleSubProcessOutput(jubatusProcess.getErrorStream, System.err, jubaCmdName) + val namedPipeWriter = new java.io.PrintWriter(namedPipe) + try { + namedPipeWriter.write(aConfigString) + } finally { + namedPipeWriter.close() + } + + new LocalJubatusApplication(jubatusProcess, aLearningMachineName, jubaCmdName) + } finally { + namedPipe.delete() + } + } + } + + def mkfifo(path: String, runtime: Runtime): Int = 
{
+    val mkfifoProcess = runtime.exec(Array("mkfifo", path))
+    mkfifoProcess.waitFor()
+  }
+
+  private def handleSubProcessOutput(in: java.io.InputStream,
+                                     out: java.io.PrintStream,
+                                     jubaCmdName: String) {
+    import java.io._
+    val thread = new Thread {
+      override def run {
+        val reader = new BufferedReader(new InputStreamReader(in))
+        try {
+          var line = reader.readLine()
+          while (line != null) {
+            out.println(s"[$jubaCmdName] $line")
+            line = reader.readLine()
+          }
+        } catch {
+          case e: IOException =>
+            logger.warn(s"caught IOException in a subprocess handler: ${e.getMessage}")
+        }
+        // Never close `out` here: it is the JVM's own stdout/stderr stream.
+      }
+    }
+    thread.setDaemon(true)
+    thread.start()
+  }
+}
+
+// LocalJubatusApplication is not actually a YARN application, but it extends
+// JubatusYarnApplication so that local and YARN-based Jubatus instances can
+// be handled through the same interface.
+class LocalJubatusApplication(jubatus: Process, name: String, jubaCmdName: String)
+  extends JubatusYarnApplication(Location(InetAddress.getLocalHost, 9199), List(), null) {
+
+  override def status: JubatusYarnApplicationStatus = {
+    throw new NotImplementedError("status is not implemented")
+  }
+
+  override def stop(): scala.concurrent.Future[Unit] = scala.concurrent.Future {
+    logger.info(s"stop LocalJubatusApplication (name: $name, $jubaCmdName)")
+    Thread.sleep(200) // this sleep prevents the jubatus process from becoming a zombie
+    jubatus.destroy()
+    jubatus.waitFor()
+  }
+
+  override def kill() {
+    throw new NotImplementedError("kill is not implemented")
+  }
+
+  override def loadModel(aModelPathPrefix: org.apache.hadoop.fs.Path, aModelId: String): Try[JubatusYarnApplication] = Try {
+    throw new NotImplementedError("loadModel is not implemented")
+  }
+
+  override def saveModel(aModelPathPrefix: org.apache.hadoop.fs.Path, aModelId: String): Try[JubatusYarnApplication] = Try {
+    throw new NotImplementedError("saveModel is not implemented")
+  }
+}
diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/MaxOptionAccumulatorParam.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/MaxOptionAccumulatorParam.scala
new file mode 100644
index 0000000..5a1c894
--- /dev/null
+++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/MaxOptionAccumulatorParam.scala
@@ -0,0 +1,41 @@
+// Jubatus: Online machine learning framework for distributed environment
+// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation.
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License version 2.1 as published by the Free Software Foundation.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+package us.jubat.jubaql_server.processor
+
+import org.apache.spark.AccumulatorParam
+
+/**
+ * Remembers the largest item that was added.
+ *
+ * Taking the maximum is an associative operation, therefore
+ * we can implement it as an accumulator.
+ */ +class MaxOptionAccumulatorParam[U](implicit ord: Ordering[U]) extends AccumulatorParam[Option[U]] { + override def zero(initialValue: Option[U]): Option[U] = initialValue + + override def addInPlace(r1: Option[U], r2: Option[U]): Option[U] = { + (r1, r2) match { + case (Some(a), Some(b)) => + Some(ord.max(a, b)) + case (Some(a), None) => + r1 + case (None, Some(b)) => + r2 + case _ => + None + } + } +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/RegistrationHandler.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/RegistrationHandler.scala new file mode 100644 index 0000000..a3a843a --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/RegistrationHandler.scala @@ -0,0 +1,58 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import com.typesafe.scalalogging.slf4j.LazyLogging +import dispatch._ +import dispatch.Defaults._ +import org.json4s.DefaultFormats +import org.json4s.native.Serialization +import jubaql.gateway.json.{Unregister, Register} +import scala.util.{Failure, Success, Try} + +class RegistrationHandler(val registerUrl: String) extends LazyLogging { + // RequestBuilder is mutable, therefore we need to work on a new copy + // for every request (i.e. use `def` instead of `val`). + def newReq = url(registerUrl) + + implicit val formats = DefaultFormats + + def register: Either[Throwable, String] = { + val (host, port) = JubaQLProcessor.getListeningAddress + val registerMsg = Register("register", host.getHostAddress, port) + val req = newReq << Serialization.write(registerMsg) + makeHttpRequest(req) + } + + def unregister: Either[Throwable, String] = { + val unregisterMsg = Unregister("unregister") + val req = newReq << Serialization.write(unregisterMsg) + makeHttpRequest(req) + } + + protected def makeHttpRequest(req: Req) = { + Try { + // if the URL is invalid, that exception will be thrown instead + // of wrapped into Either, so we have to wrap it ourselves + Http(req OK as.String).either.apply + } match { + case Success(someEither) => + someEither + case Failure(someException) => + Left(someException) + } + } +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/StringWrapper.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/StringWrapper.scala new file mode 100644 index 0000000..aecfaed --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/StringWrapper.scala @@ -0,0 +1,18 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. 
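// A minimal usage sketch for MaxOptionAccumulatorParam, assuming a Spark 1.x
// SparkContext; the RDD contents below are invented for illustration.
object MaxOptionAccumulatorParamSketch {
  import org.apache.spark.SparkContext
  import us.jubat.jubaql_server.processor.MaxOptionAccumulatorParam

  def largestSeen(sc: SparkContext): Option[Long] = {
    // starting from None means an empty RDD yields no maximum at all
    val maxAcc = sc.accumulator(None: Option[Long])(new MaxOptionAccumulatorParam[Long])
    sc.parallelize(Seq(3L, 141L, 59L)).foreach(x => maxAcc += Some(x))
    maxAcc.value // Some(141)
  }
}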
+// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +case class StringWrapper(value: String) diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/json/AnomalyScore.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/json/AnomalyScore.scala new file mode 100644 index 0000000..bff5aed --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/json/AnomalyScore.scala @@ -0,0 +1,18 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor.json + +case class AnomalyScore(score: Float) diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/json/ClassifierPrediction.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/json/ClassifierPrediction.scala new file mode 100644 index 0000000..89ffe04 --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/json/ClassifierPrediction.scala @@ -0,0 +1,18 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
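// A sketch of how such a result case class becomes the JSON returned over
// HTTP, using the same json4s serialization style seen elsewhere in this
// patch; the score value is invented.
object AnomalyScoreJsonSketch {
  import org.json4s.DefaultFormats
  import org.json4s.native.Serialization
  import us.jubat.jubaql_server.processor.json.AnomalyScore

  implicit val formats = DefaultFormats

  // Serialization.write(AnomalyScore(1.5f)) == """{"score":1.5}"""
  def main(args: Array[String]): Unit =
    println(Serialization.write(AnomalyScore(1.5f)))
}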
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor.json + +case class ClassifierPrediction(label: String, score: Double) diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/json/ClassifierResult.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/json/ClassifierResult.scala new file mode 100644 index 0000000..e163751 --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/json/ClassifierResult.scala @@ -0,0 +1,18 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor.json + +case class ClassifierResult(predictions: List[ClassifierPrediction]) diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/json/DatumResult.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/json/DatumResult.scala new file mode 100644 index 0000000..32edaa1 --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/json/DatumResult.scala @@ -0,0 +1,18 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor.json + +case class DatumResult(string_values: Map[String, String], num_values: Map[String, Double]) diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/json/Register.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/json/Register.scala new file mode 100644 index 0000000..f94cf13 --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/json/Register.scala @@ -0,0 +1,18 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. 
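// A sketch of the nested shape used for classifier answers: one
// ClassifierPrediction per candidate label, wrapped in a ClassifierResult.
// Labels and scores here are invented (cf. the shogun example data in this
// patch).
object ClassifierResultJsonSketch {
  import org.json4s.DefaultFormats
  import org.json4s.native.Serialization
  import us.jubat.jubaql_server.processor.json.{ClassifierPrediction, ClassifierResult}

  implicit val formats = DefaultFormats

  val result = ClassifierResult(List(
    ClassifierPrediction("徳川", 0.93),
    ClassifierPrediction("豊臣", 0.07)))

  // Serialization.write(result) ==
  //   """{"predictions":[{"label":"徳川","score":0.93},{"label":"豊臣","score":0.07}]}"""
}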
+// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package jubaql.gateway.json + +case class Register(action: String, ip: String, port: Int) diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/json/Unregister.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/json/Unregister.scala new file mode 100644 index 0000000..0f00550 --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/json/Unregister.scala @@ -0,0 +1,18 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package jubaql.gateway.json + +case class Unregister(action: String) diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/logical/RegisterAsTable.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/logical/RegisterAsTable.scala new file mode 100644 index 0000000..b9bc9a6 --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/logical/RegisterAsTable.scala @@ -0,0 +1,28 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
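// A sketch of the registration payloads that RegistrationHandler builds from
// these case classes and POSTs to the gateway; the address and port are
// hypothetical.
object RegisterMessageSketch {
  import org.json4s.DefaultFormats
  import org.json4s.native.Serialization
  import jubaql.gateway.json.{Register, Unregister}

  implicit val formats = DefaultFormats

  // == """{"action":"register","ip":"192.168.1.23","port":9876}"""
  val registerJson = Serialization.write(Register("register", "192.168.1.23", 9876))
  // == """{"action":"unregister"}"""
  val unregisterJson = Serialization.write(Unregister("unregister"))
}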
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor.logical + +import org.apache.spark.sql.catalyst.plans.logical.{UnaryNode, LogicalPlan} + +/* + * This logical plan has only the purpose of triggering the + * assignment of a table name to this RDD after processing + * (i.e., exists so that we can recognize that this is a + * "CREATE JSON_TABLE" statement even outside of the parser). + */ +case class RegisterAsTable(child: LogicalPlan, tableName: String) extends UnaryNode { + override def output = child.output +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Anomaly.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Anomaly.scala new file mode 100644 index 0000000..0575deb --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Anomaly.scala @@ -0,0 +1,40 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor.updater + +import us.jubat.jubaql_server.processor.CreateModel +import org.json4s._ +import us.jubat.anomaly.AnomalyClient + +class Anomaly(val jubaHost: String, jubaPort: Int, cm: CreateModel, val keys: List[String]) extends Updater with Serializable { + override def apply(iter: Iterator[JValue], statusUrl: String): Iterator[Unit] = { + HttpClientPerJvm.startChecking(statusUrl) + val client = new AnomalyClient(jubaHost, jubaPort, cm.modelName, 5) + val logger = createLogger + logger.info(s"started AnomalyClient: $client") + var stopped_? = HttpClientPerJvm.stopped + val out = iter.takeWhile(_ => !stopped_?).zipWithIndex.map(valueWithIndex => { + val (jvalue, idx) = valueWithIndex + client.add(extractDatum(keys, jvalue)) + if ((idx+1) % 1000 == 0) { + logger.debug("processed 1000 items using 'add' method") + stopped_? = HttpClientPerJvm.stopped + } + () + }) + out + } +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Classifier.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Classifier.scala new file mode 100644 index 0000000..807e95d --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Classifier.scala @@ -0,0 +1,52 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. 
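// A sketch of how an updater like Anomaly is presumably driven: handed to
// RDD.mapPartitions so that each partition creates one AnomalyClient and
// pushes every JSON row through `add`. This wiring is an assumption made for
// illustration; the actual call site lives in JubaQLService.
object AnomalyUpdaterSketch {
  import org.apache.spark.rdd.RDD
  import org.json4s.JValue
  import us.jubat.jubaql_server.processor.updater.Anomaly

  def runUpdate(rows: RDD[JValue], updater: Anomaly, statusUrl: String): Long =
    // count() forces the otherwise lazy per-partition update to execute
    rows.mapPartitions(iter => updater(iter, statusUrl)).count()
}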
+// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor.updater + +import us.jubat.jubaql_server.processor.CreateModel +import org.json4s._ +import us.jubat.classifier.{ClassifierClient, LabeledDatum} + +class Classifier(val jubaHost: String, jubaPort: Int, cm: CreateModel, val keys: List[String]) extends Updater with Serializable { + override def apply(iter: Iterator[JValue], statusUrl: String): Iterator[Unit] = { + HttpClientPerJvm.startChecking(statusUrl) + val client = new ClassifierClient(jubaHost, jubaPort, cm.modelName, 5) + val logger = createLogger + logger.info(s"started ClassifierClient: $client") + val label = cm.specifier.toMap.get("label") match { + case Some(la :: Nil) => la + case _ => ??? // TODO: throw exception + } + var stopped_? = HttpClientPerJvm.stopped + val out = iter.takeWhile(_ => !stopped_?).zipWithIndex.map(valueWithIndex => { + val (jvalue, idx) = valueWithIndex + // find string for label + jvalue \ label match { + case JString(trainLabel) => + val data = new java.util.LinkedList[LabeledDatum]() + data.add(new LabeledDatum(trainLabel, extractDatum(keys, jvalue))) + client.train(data) + if ((idx+1) % 1000 == 0) { + logger.debug("processed 1000 items using 'train' method") + stopped_? = HttpClientPerJvm.stopped + } + case _ => + // `label` string field not found + } + () + }) + out + } +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/HttpClientPerJvm.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/HttpClientPerJvm.scala new file mode 100644 index 0000000..b11383d --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/HttpClientPerJvm.scala @@ -0,0 +1,56 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
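// A sketch of the row shape the Classifier updater expects when a model was
// created with label field "label" and datum key "name"; the concrete values
// are invented (cf. shogun_data.json). The field named by `label` becomes the
// LabeledDatum label, and only the listed keys end up in the Datum built by
// extractDatum.
object ClassifierRowSketch {
  import org.json4s._
  import org.json4s.native.JsonMethods.parse

  // would train the label "徳川" with the single string feature name = "家康"
  val row: JValue = parse("""{"label": "徳川", "name": "家康"}""")
}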
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor.updater + +import scala.concurrent.future +import scala.concurrent.ExecutionContext.Implicits.global +import dispatch._ +import com.typesafe.scalalogging.slf4j.LazyLogging + +object HttpClientPerJvm extends LazyLogging { + // flag to store whether we are already running the status checker + protected var running = false + + protected var _stopped = false + + def stopped: Boolean = _stopped + + def startChecking(statusUrl: String) = synchronized { + // only start the check if none is running already + if (!running) { + // start in a separate thread + future { + val h = new Http() + // unless we see a "shutdown" message or an error, keep running + while (!_stopped) { + h(url(statusUrl) OK as.String).option.apply() match { + case Some("shutdown") | None => + // if we see shutdown state, stop running + logger.debug("status switched to 'shutdown', stopping") + synchronized { + _stopped = true + } + case _ => + } + Thread.sleep(1000) + } + h.shutdown() + } + // indicate that we are running now + running = true + } + } +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Recommender.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Recommender.scala new file mode 100644 index 0000000..e0bb245 --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Recommender.scala @@ -0,0 +1,47 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor.updater + +import us.jubat.jubaql_server.processor.CreateModel +import org.json4s._ +import us.jubat.recommender.RecommenderClient + +class Recommender(val jubaHost: String, jubaPort: Int, cm: CreateModel, val id: String, val keys: List[String]) extends Updater with Serializable { + override def apply(iter: Iterator[JValue], statusUrl: String): Iterator[Unit] = { + HttpClientPerJvm.startChecking(statusUrl) + val client = new RecommenderClient(jubaHost, jubaPort, cm.modelName, 5) + val logger = createLogger + logger.info(s"started RecommenderClient: $client") + var stopped_? = HttpClientPerJvm.stopped + val out = iter.takeWhile(_ => !stopped_?).zipWithIndex.map(valueWithIndex => { + val (jvalue, idx) = valueWithIndex + // update_row + jvalue \ id match { + case JString(updateId) => + val datum = extractDatum(keys, jvalue) + client.updateRow(updateId, datum) + if ((idx+1) % 1000 == 0) { + logger.debug("processed 1000 items using 'updateRow' method") + stopped_? 
= HttpClientPerJvm.stopped
+          }
+        case _ =>
+          // `id` string field not found
+      }
+      ()
+    })
+    out
+  }
+}
diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Updater.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Updater.scala
new file mode 100644
index 0000000..00478c1
--- /dev/null
+++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Updater.scala
@@ -0,0 +1,54 @@
+// Jubatus: Online machine learning framework for distributed environment
+// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation.
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License version 2.1 as published by the Free Software Foundation.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+package us.jubat.jubaql_server.processor.updater
+
+import org.json4s._
+import us.jubat.common.Datum
+import com.typesafe.scalalogging.slf4j.Logger
+import org.slf4j.LoggerFactory
+
+trait Updater {
+  def apply(iter: scala.Iterator[JValue], statusUrl: String): Iterator[Unit]
+
+  protected def createLogger: Logger = {
+    Logger(LoggerFactory getLogger getClass.getName)
+  }
+
+  protected def extractDatum(keys: List[String], jvalue: JValue): Datum = {
+    // filter out fields that are not listed in `keys`
+    val filtered = jvalue.filterField {
+      case JField(key, _) => keys.indexOf(key) >= 0
+      case _ => false
+    }
+
+    val datum = new Datum
+    filtered.foreach {
+      case (key, value) =>
+        value match {
+          case JInt(v) =>
+            datum.addNumber(key, v.toDouble)
+          case JDouble(v) =>
+            datum.addNumber(key, v)
+          case JString(v) =>
+            datum.addString(key, v)
+          case _ =>
+        }
+    }
+    datum
+  }
+}
diff --git a/processor/src/test/resources/core-site.xml.dist b/processor/src/test/resources/core-site.xml.dist
new file mode 100644
index 0000000..3e3540a
--- /dev/null
+++ b/processor/src/test/resources/core-site.xml.dist
@@ -0,0 +1,128 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://[host]:[port]</value>
+  </property>
+  <property>
+    <name>fs.trash.interval</name>
+    <value>1</value>
+  </property>
+  <property>
+    <name>io.compression.codecs</name>
+    <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.compress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec</value>
+  </property>
+  <property>
+    <name>hadoop.security.authentication</name>
+    <value>simple</value>
+  </property>
+  <property>
+    <name>hadoop.security.authorization</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>hadoop.rpc.protection</name>
+    <value>authentication</value>
+  </property>
+  <property>
+    <name>hadoop.ssl.require.client.cert</name>
+    <value>false</value>
+    <final>true</final>
+  </property>
+  <property>
+    <name>hadoop.ssl.keystores.factory.class</name>
+    <value>org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory</value>
+    <final>true</final>
+  </property>
+  <property>
+    <name>hadoop.ssl.server.conf</name>
+    <value>ssl-server.xml</value>
+    <final>true</final>
+  </property>
+  <property>
+    <name>hadoop.ssl.client.conf</name>
+    <value>ssl-client.xml</value>
+    <final>true</final>
+  </property>
+  <property>
+    <name>hadoop.security.auth_to_local</name>
+    <value>DEFAULT</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.oozie.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.oozie.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.mapred.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.mapred.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.flume.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.flume.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.HTTP.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.HTTP.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hive.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hive.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hue.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hue.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.httpfs.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.httpfs.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hdfs.groups</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.proxyuser.hdfs.hosts</name>
+    <value>*</value>
+  </property>
+  <property>
+    <name>hadoop.security.group.mapping</name>
+    <value>org.apache.hadoop.security.ShellBasedUnixGroupsMapping</value>
+  </property>
+  <property>
+    <name>hadoop.security.instrumentation.requires.admin</name>
+    <value>false</value>
+  </property>
+</configuration>
diff --git a/processor/src/test/resources/dummydata/1.json b/processor/src/test/resources/dummydata/1.json
new file mode 100644
index 0000000..b0c72b5
--- /dev/null
+++ b/processor/src/test/resources/dummydata/1.json
@@ -0,0 +1,2 @@
+{"video_id": 1}
+{"video_id": 2}
diff --git a/processor/src/test/resources/dummydata/2.json b/processor/src/test/resources/dummydata/2.json
new file mode 100644
index 0000000..9526c4f
--- /dev/null
+++ b/processor/src/test/resources/dummydata/2.json
@@ -0,0 +1,2 @@
+{"video_id": 3}
+{"video_id": 4}
diff --git a/processor/src/test/resources/hdfs-site.xml.dist b/processor/src/test/resources/hdfs-site.xml.dist
new file mode 100644
index 0000000..790de19
--- /dev/null
+++ b/processor/src/test/resources/hdfs-site.xml.dist
@@ -0,0 +1,68 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+  <property>
+    <name>dfs.namenode.name.dir</name>
+    <value>file:///dfs/nn</value>
+  </property>
+  <property>
+    <name>dfs.namenode.servicerpc-address</name>
+    <value>[host]:[port]</value>
+  </property>
+  <property>
+    <name>dfs.https.address</name>
+    <value>[host]:[port]</value>
+  </property>
+  <property>
+    <name>dfs.https.port</name>
+    <value>50470</value>
+  </property>
+  <property>
+    <name>dfs.namenode.http-address</name>
+    <value>[host]:[port]</value>
+  </property>
+  <property>
+    <name>dfs.replication</name>
+    <value>3</value>
+  </property>
+  <property>
+    <name>dfs.blocksize</name>
+    <value>134217728</value>
+  </property>
+  <property>
+    <name>dfs.client.use.datanode.hostname</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>fs.permissions.umask-mode</name>
+    <value>022</value>
+  </property>
+  <property>
+    <name>dfs.namenode.acls.enabled</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.client.read.shortcircuit</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.domain.socket.path</name>
+    <value>/var/run/hdfs-sockets/dn</value>
+  </property>
+  <property>
+    <name>dfs.client.read.shortcircuit.skip.checksum</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.client.domain.socket.data.traffic</name>
+    <value>false</value>
+  </property>
+  <property>
+    <name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
+    <value>true</value>
+  </property>
+</configuration>
diff --git a/processor/src/test/resources/kafka.xml.dist b/processor/src/test/resources/kafka.xml.dist
new file mode 100644
index 0000000..634f690
--- /dev/null
+++ b/processor/src/test/resources/kafka.xml.dist
@@ -0,0 +1,6 @@
+[kafka path]
diff --git a/processor/src/test/resources/log4j.xml b/processor/src/test/resources/log4j.xml
new file mode 100644
index 0000000..1c2ac4b
--- /dev/null
+++ b/processor/src/test/resources/log4j.xml
@@ -0,0 +1,59 @@
diff --git a/processor/src/test/resources/lof.json b/processor/src/test/resources/lof.json
new file mode 100644
index 0000000..93286fe
--- /dev/null
+++ b/processor/src/test/resources/lof.json
@@ -0,0 +1,32 @@
+{
+  "converter" : {
+    "string_filter_types" : {},
+    "string_filter_rules" : [],
+    "num_filter_types" : {},
+    "num_filter_rules" : [],
+    "string_types": {
+      "unigram": { "method": "ngram", "char_num": "1" }
+    },
+    "string_rules" : [
+      { "key" : "*", "type" : "unigram", "sample_weight" : "bin", "global_weight" : "bin" }
+    ],
+    "num_types" : {},
+    "num_rules" : [
+      { "key" : "*", "type" : "num" }
+    ]
+  },
+  "parameter" : {
+    "nearest_neighbor_num" : 10,
+    "reverse_nearest_neighbor_num" : 30,
+    "method" : "euclid_lsh",
+    "parameter" : {
+      "hash_num" : 64,
+      "table_num" : 4,
+      "seed" : 1091,
+      "probe_num" : 64,
+      "bin_width" : 100,
+      "retain_projection" : false
+    }
+  },
+  "method" : "lof"
+}
diff --git a/processor/src/test/resources/npb_similar_player.csv b/processor/src/test/resources/npb_similar_player.csv
new file mode 100644
index 0000000..c9c6fd6
--- /dev/null
+++
b/processor/src/test/resources/npb_similar_player.csv @@ -0,0 +1,144 @@ +長野久義,巨人,0.301,144,653,574,173,14,60,20,75,1,100,2,5,0.432,0.382,0.815,6.45,6.2 +大島洋平,中日,0.31,144,631,555,172,1,13,32,46,13,80,17,7,0.368,0.376,0.744,5.13,4.91 +鳥谷敬,阪神,0.262,144,624,515,135,8,59,15,94,2,91,5,12,0.375,0.373,0.748,5.28,5.33 +坂本勇人,巨人,0.311,144,619,557,173,14,69,16,39,6,90,12,5,0.456,0.359,0.815,6.29,6.06 +中田翔,日本ハム,0.239,144,606,547,131,24,77,5,50,5,101,0,8,0.42,0.307,0.727,4.44,4.54 +李大浩,オリックス,0.286,144,601,525,150,24,91,0,64,7,85,0,18,0.478,0.368,0.846,5.99,5.95 +陽岱鋼,日本ハム,0.287,144,599,533,153,7,55,17,37,6,123,18,10,0.398,0.337,0.735,4.7,4.59 +糸井嘉男,日本ハム,0.304,134,597,510,155,9,48,22,75,11,86,0,9,0.41,0.404,0.813,6.47,6.26 +聖澤諒,楽天,0.27,138,595,523,141,4,45,54,49,7,104,12,4,0.331,0.338,0.669,4.55,4.54 +田中浩康,ヤクルト,0.274,139,593,486,133,2,40,1,54,9,60,40,15,0.323,0.354,0.677,3.89,3.86 +和田一浩,中日,0.285,144,586,508,145,9,63,2,71,1,72,0,14,0.409,0.37,0.78,5.57,5.47 +根元俊一,ロッテ,0.279,133,584,512,143,9,41,6,31,1,98,40,7,0.396,0.322,0.718,4.16,4.01 +ヘルマン,西武,0.27,144,583,507,137,3,60,41,57,4,75,10,20,0.343,0.346,0.689,4.06,4.05 +井口資仁,ロッテ,0.255,140,578,505,129,11,60,3,53,16,99,0,11,0.384,0.343,0.727,4.73,4.7 +村田修一,巨人,0.252,144,575,516,130,12,58,1,36,15,85,2,16,0.374,0.316,0.69,3.83,3.89 +梵英心,広島,0.244,137,575,499,122,10,52,14,50,1,70,19,7,0.359,0.311,0.67,3.77,3.85 +バルディリス,オリックス,0.264,143,574,503,133,10,55,1,54,14,76,0,13,0.394,0.35,0.744,4.9,4.8 +荒木雅博,中日,0.251,129,569,510,128,3,31,12,19,2,65,36,4,0.314,0.28,0.593,2.93,2.89 +内川聖一,ソフトバンク,0.3,138,567,523,157,7,53,6,31,6,36,0,12,0.392,0.342,0.734,4.81,4.73 +中島裕之,西武,0.311,136,567,499,155,13,74,7,52,9,76,1,10,0.451,0.382,0.833,6.29,6.14 +明石健志,ソフトバンク,0.254,135,567,508,129,1,27,25,33,2,98,23,2,0.299,0.301,0.601,3.21,3.21 +サブロー,ロッテ,0.239,137,560,476,114,7,52,0,78,2,105,0,12,0.342,0.346,0.689,4.29,4.32 +阿部慎之助,巨人,0.34,138,556,467,159,27,104,0,69,9,47,2,11,0.565,0.429,0.994,9,8.79 +堂林翔太,広島,0.242,144,554,488,118,14,45,5,44,14,150,5,8,0.395,0.321,0.716,4.4,4.42 +井端弘和,中日,0.284,140,553,489,139,2,35,4,52,3,58,8,14,0.331,0.356,0.687,4.18,4.06 +荒波翔,DeNA,0.268,141,550,504,135,1,25,24,23,5,90,16,2,0.333,0.305,0.639,3.59,3.52 +小谷野栄一,日本ハム,0.228,134,550,478,109,3,39,6,27,4,72,40,8,0.293,0.275,0.567,2.48,2.47 +後藤光尊,オリックス,0.242,131,546,520,126,4,43,3,14,5,74,1,8,0.306,0.266,0.572,2.6,2.69 +ミレッジ,ヤクルト,0.3,125,546,476,143,21,65,9,57,6,79,3,11,0.485,0.379,0.865,6.46,6.34 +本多雄一,ソフトバンク,0.246,123,536,480,118,0,31,34,37,1,65,14,6,0.294,0.299,0.593,3.26,3.3 +角中勝也,ロッテ,0.312,128,525,477,149,3,61,8,38,5,68,1,9,0.415,0.366,0.782,5.67,5.43 +平野恵一,阪神,0.245,134,519,458,112,1,24,6,42,4,61,15,4,0.271,0.313,0.584,3.12,3.13 +ペーニャ,ソフトバンク,0.28,130,507,461,129,21,76,2,35,8,130,0,11,0.49,0.339,0.829,5.8,5.68 +川端慎吾,ヤクルト,0.298,125,507,453,135,4,49,3,35,2,56,13,7,0.38,0.348,0.728,4.82,4.72 +田中賢介,日本ハム,0.3,114,505,457,137,3,32,13,35,2,36,8,2,0.363,0.35,0.713,4.81,4.7 +ラミレス,DeNA,0.3,137,504,476,143,19,76,0,18,7,60,0,18,0.473,0.333,0.806,5.26,5.13 +今江敏晃,ロッテ,0.253,136,501,446,113,6,47,0,19,5,37,23,15,0.354,0.287,0.641,2.98,3.08 +中村紀洋,DeNA,0.274,126,500,442,121,11,61,1,50,2,72,0,14,0.407,0.346,0.753,4.86,4.87 +中村剛也,西武,0.231,123,498,432,100,27,79,2,56,9,125,0,11,0.461,0.331,0.792,5.04,5.16 +稲葉篤紀,日本ハム,0.29,127,497,449,130,10,61,0,32,5,70,8,3,0.421,0.342,0.762,5.38,5.22 +畠山和洋,ヤクルト,0.266,121,497,455,121,13,55,2,37,2,64,1,13,0.402,0.323,0.725,4.39,4.35 +森野将彦,中日,0.249,124,494,434,108,6,50,1,49,3,68,4,13,0.348,0.327,0.674,3.87,3.89 +新井貴浩,阪神,0.25,122,493,460,115,9,52,1,30,1,85,0,12,0.363,0.296,0.659,3.43,3.4 
+フェルナンデス,楽天,0.243,129,489,440,107,3,51,2,44,2,67,0,11,0.311,0.313,0.624,3.25,3.28 +銀次,楽天,0.28,126,485,432,121,4,45,8,22,4,37,23,4,0.354,0.318,0.672,3.98,3.94 +川端崇義,オリックス,0.266,125,484,429,114,2,27,6,22,9,59,24,8,0.317,0.315,0.632,3.36,3.25 +岡田幸文,ロッテ,0.262,131,482,431,113,0,18,23,18,6,47,25,4,0.288,0.3,0.587,2.82,2.85 +長谷川勇也,ソフトバンク,0.278,126,473,403,112,4,37,16,40,9,87,16,4,0.36,0.352,0.712,4.77,4.76 +マートン,阪神,0.26,121,473,453,118,5,38,2,18,1,56,0,14,0.342,0.29,0.632,3.01,2.96 +栗山巧,西武,0.289,103,467,394,114,2,33,3,52,6,62,12,4,0.353,0.378,0.731,5.28,5.18 +浅村栄斗,西武,0.245,114,459,404,99,7,37,13,34,4,63,13,9,0.376,0.307,0.683,3.79,3.84 +谷繁元信,中日,0.228,134,458,386,88,5,32,0,52,4,67,14,15,0.303,0.324,0.627,3.11,3.16 +秋山翔吾,西武,0.293,107,450,403,118,4,37,10,28,3,70,15,7,0.404,0.343,0.747,4.82,4.63 +筒香嘉智,DeNA,0.218,108,446,386,84,10,45,1,51,2,102,2,4,0.352,0.309,0.661,3.8,3.98 +高橋由伸,巨人,0.239,130,442,368,88,8,56,2,61,8,77,1,14,0.351,0.356,0.707,4.23,4.32 +里崎智也,ロッテ,0.244,120,439,385,94,9,41,0,33,4,80,14,11,0.345,0.308,0.654,3.38,3.46 +松井稼頭央,楽天,0.266,106,433,402,107,9,43,14,26,1,55,3,6,0.408,0.312,0.72,4.58,4.45 +大引啓次,オリックス,0.224,110,432,352,79,6,20,6,46,3,74,28,8,0.315,0.317,0.632,3.33,3.44 +牧田明久,楽天,0.225,123,425,378,85,9,53,5,23,9,49,10,11,0.331,0.283,0.613,2.68,2.86 +バレンティン,ヤクルト,0.272,106,422,353,96,31,81,1,64,3,92,0,14,0.572,0.386,0.958,7.34,7.35 +炭谷銀仁朗,西武,0.194,139,414,360,70,0,23,0,17,1,74,35,9,0.233,0.232,0.466,1.17,1.23 +金本知憲,阪神,0.258,126,406,356,92,6,30,3,49,0,54,0,2,0.36,0.347,0.707,4.78,4.74 +T-岡田,オリックス,0.28,103,404,378,106,10,56,4,22,4,81,0,5,0.418,0.327,0.745,4.87,4.7 +天谷宗一郎,広島,0.265,108,397,359,95,6,25,12,32,3,64,2,3,0.373,0.329,0.702,4.33,4.25 +宮本慎也,ヤクルト,0.267,110,394,356,95,3,23,1,16,5,38,16,9,0.306,0.307,0.613,3.08,3.05 +松田宣浩,ソフトバンク,0.3,95,390,360,108,9,56,16,27,1,63,0,7,0.492,0.349,0.84,5.87,5.59 +新井良太,阪神,0.28,110,370,322,90,11,32,1,38,6,73,3,7,0.438,0.365,0.803,5.77,5.65 +スケールズ,オリックス,0.262,85,362,305,80,5,23,4,47,8,103,0,3,0.384,0.373,0.757,5.6,5.54 +ブランコ,中日,0.248,96,359,311,77,24,65,2,40,5,84,0,5,0.511,0.34,0.851,6.06,6.22 +小久保裕紀,ソフトバンク,0.239,103,357,331,79,4,34,0,20,0,67,3,6,0.308,0.28,0.588,2.8,2.89 +鉄平,楽天,0.251,121,355,315,79,1,33,8,11,6,45,18,7,0.302,0.285,0.586,2.62,2.75 +大和,阪神,0.257,128,349,311,80,0,26,17,15,4,46,19,3,0.318,0.3,0.618,3.19,3.1 +今宮健太,ソフトバンク,0.238,126,343,307,73,2,14,8,10,3,75,21,2,0.29,0.267,0.557,2.44,2.51 +丸佳浩,広島,0.247,106,339,283,70,4,22,14,46,2,59,5,4,0.353,0.353,0.707,4.62,4.7 +廣瀬純,広島,0.241,102,337,294,71,6,33,0,24,11,55,7,3,0.347,0.321,0.668,4.02,4.02 +金城龍彦,DeNA,0.238,129,331,294,70,3,18,2,26,6,37,3,1,0.306,0.311,0.617,3.58,3.65 +鶴岡慎也,日本ハム,0.266,116,328,289,77,0,25,3,14,1,41,23,11,0.308,0.302,0.61,2.68,2.61 +ホフパワー,日本ハム,0.247,109,325,296,73,14,37,1,25,3,81,0,3,0.432,0.311,0.743,4.74,4.77 +嶋基宏,楽天,0.291,91,316,265,77,1,8,3,33,4,51,13,5,0.332,0.376,0.708,4.53,4.47 +金子誠,日本ハム,0.227,103,311,277,63,0,22,1,15,0,53,17,2,0.292,0.265,0.558,2.3,2.31 +石川雄洋,DeNA,0.285,80,304,263,75,1,14,7,24,3,49,13,3,0.342,0.351,0.693,4.21,4.11 +平田良介,中日,0.216,91,301,269,58,11,32,1,28,1,59,1,9,0.361,0.29,0.651,3.26,3.47 +大崎雄太朗,西武,0.269,107,298,260,70,1,22,1,17,7,27,10,4,0.346,0.326,0.673,4.06,4.04 +ガルシア,楽天,0.227,77,296,269,61,7,30,0,23,2,65,0,6,0.349,0.291,0.64,3.27,3.38 +ブラゼル,阪神,0.233,98,295,275,64,12,43,0,18,2,72,0,16,0.404,0.285,0.688,3.15,3.21 +清田育宏,ロッテ,0.281,87,292,253,71,3,29,5,34,0,46,3,5,0.391,0.363,0.755,5,4.9 +枡田慎太郎,楽天,0.295,79,290,254,75,5,32,1,19,5,56,7,2,0.433,0.35,0.783,5.55,5.51 +内村賢介,DeNA,0.237,77,289,236,56,0,18,18,31,0,40,21,4,0.263,0.325,0.587,3.19,3.22 
+森岡良介,ヤクルト,0.249,100,282,245,61,1,18,1,11,5,44,19,1,0.318,0.293,0.611,3.16,3.17 +東出輝裕,広島,0.247,91,282,247,61,0,6,1,13,3,22,19,3,0.279,0.293,0.572,2.62,2.57 +藤村大介,巨人,0.252,109,279,238,60,0,10,14,15,0,35,26,2,0.298,0.296,0.595,2.84,2.79 +大松尚逸,ロッテ,0.198,89,275,258,51,5,22,0,9,2,46,3,7,0.291,0.228,0.519,1.7,1.91 +ホワイトセル,ロッテ,0.309,63,257,223,69,9,43,0,29,4,75,0,2,0.489,0.397,0.886,7.37,7.16 +谷佳知,巨人,0.258,89,255,229,59,3,22,1,14,0,48,11,5,0.319,0.299,0.618,2.97,2.97 +梶谷隆幸,DeNA,0.179,80,252,223,40,2,11,5,21,1,61,7,4,0.256,0.253,0.509,1.48,1.64 +エルドレッド,広島,0.262,65,251,225,59,11,35,0,20,3,67,0,3,0.453,0.327,0.78,5.29,5.38 +石原慶幸,広島,0.24,77,250,217,52,1,22,0,22,1,37,9,7,0.323,0.311,0.634,3.11,2.99 +中村悠平,ヤクルト,0.254,91,249,209,53,1,15,1,28,2,31,8,10,0.292,0.344,0.636,3.23,3.25 +赤松真人,広島,0.242,76,246,207,50,3,15,18,18,4,33,16,4,0.314,0.313,0.627,3.62,3.65 +相川亮二,ヤクルト,0.245,72,244,220,54,1,28,0,20,1,33,1,7,0.295,0.309,0.604,2.98,3 +岩本貴裕,広島,0.268,71,242,239,64,6,27,1,2,0,43,1,5,0.406,0.274,0.68,3.63,3.5 +菊池涼介,広島,0.229,63,234,201,46,2,12,4,6,1,42,25,5,0.294,0.254,0.547,1.99,2.04 +片岡易之,西武,0.225,52,231,204,46,2,19,8,15,1,30,7,6,0.294,0.277,0.571,2.19,2.44 +森本稀哲,DeNA,0.244,108,230,201,49,3,18,0,20,3,44,4,6,0.323,0.32,0.643,3.27,3.34 +鶴岡一成,DeNA,0.189,102,229,201,38,1,15,0,19,0,39,8,4,0.254,0.258,0.512,1.87,1.94 +松本哲也,巨人,0.258,83,229,198,51,0,11,12,17,0,34,12,4,0.313,0.313,0.626,3.5,3.5 +藤井彰人,阪神,0.248,81,228,210,52,1,10,0,9,2,39,7,3,0.286,0.285,0.571,2.59,2.54 +寺内崇幸,巨人,0.241,103,225,191,46,1,5,11,16,3,43,13,6,0.293,0.307,0.6,2.97,3.06 +上本博紀,阪神,0.254,62,224,197,50,1,7,13,19,5,46,2,3,0.345,0.333,0.679,4.26,4.18 +多村仁志,ソフトバンク,0.25,79,218,200,50,4,20,0,18,0,43,0,7,0.365,0.312,0.677,3.57,3.53 +細川亨,ソフトバンク,0.157,92,217,185,29,2,13,0,8,1,50,21,1,0.205,0.194,0.399,0.72,0.98 +福地寿樹,ヤクルト,0.255,83,216,188,48,0,19,12,16,4,29,6,3,0.335,0.324,0.659,4.17,4.1 +山崎武司,中日,0.209,90,215,191,40,1,13,1,18,4,44,0,5,0.293,0.288,0.582,2.72,2.79 +オーティズ,西武,0.286,64,214,199,57,9,21,0,14,1,38,0,8,0.462,0.336,0.799,5,4.91 +渡辺直人,DeNA,0.224,70,213,174,39,0,10,2,24,10,27,5,1,0.247,0.351,0.598,3.35,3.44 +柳田悠岐,ソフトバンク,0.246,68,212,195,48,5,18,6,10,5,56,2,2,0.385,0.3,0.685,4.14,4.05 +高須洋介,楽天,0.244,58,212,176,43,0,9,2,24,2,15,8,5,0.284,0.338,0.622,3.47,3.53 +ボウカー,巨人,0.196,69,204,184,36,3,10,2,16,2,54,2,3,0.31,0.267,0.577,2.64,2.66 +ニック,広島,0.238,52,204,181,43,9,24,0,22,1,42,0,4,0.431,0.324,0.754,4.78,4.79 +福浦和也,ロッテ,0.25,84,199,180,45,0,25,0,15,1,27,1,3,0.267,0.308,0.575,2.92,3.03 +伊藤光,オリックス,0.205,66,197,176,36,0,10,0,5,0,48,12,2,0.256,0.222,0.477,1.49,1.7 +エドガー,巨人,0.236,57,190,174,41,4,19,0,13,1,46,1,6,0.374,0.291,0.665,3.17,3.19 +藤田一也,楽天,0.308,63,188,172,53,0,15,5,4,0,10,11,4,0.378,0.322,0.7,4.05,3.84 +荻野貴司,ロッテ,0.224,61,187,165,37,1,8,13,11,5,17,6,0,0.273,0.293,0.566,3.11,3.14 +堂上直倫,中日,0.21,116,182,167,35,0,11,1,5,1,34,8,6,0.257,0.236,0.493,1.39,1.42 +江川智晃,ソフトバンク,0.244,56,175,160,39,4,18,1,8,0,37,4,3,0.388,0.275,0.662,3.5,3.7 +上田剛史,ヤクルト,0.257,50,173,148,38,0,12,8,12,0,30,13,2,0.318,0.313,0.63,3.23,3.15 +小宮山慎二,阪神,0.148,72,171,149,22,1,4,0,11,2,32,9,4,0.174,0.216,0.391,0.53,0.69 +堂上剛裕,中日,0.282,85,166,156,44,4,17,1,8,0,36,1,4,0.41,0.315,0.725,4.3,4.19 +坂口智隆,オリックス,0.228,40,165,158,36,0,8,2,5,0,13,2,0,0.253,0.252,0.505,1.96,2.01 +松中信彦,ソフトバンク,0.221,65,164,136,30,4,13,1,26,2,26,0,1,0.324,0.354,0.677,4.6,4.69 +倉義和,広島,0.195,70,160,133,26,1,15,1,14,5,20,8,6,0.263,0.296,0.559,2.23,2.28 +鈴木大地,ロッテ,0.274,62,160,135,37,0,11,0,13,1,23,10,1,0.326,0.34,0.666,4.07,4 +大野奨太,日本ハム,0.171,70,159,140,24,2,11,0,8,2,30,9,2,0.243,0.227,0.47,1.2,1.35 
+北川博敏,オリックス,0.221,59,159,140,31,1,11,0,17,0,23,0,3,0.286,0.302,0.588,2.95,3.12 +柴田講平,阪神,0.234,73,157,128,30,0,2,4,16,1,23,11,1,0.273,0.322,0.595,3.03,3.12 +小池正晃,DeNA,0.192,88,155,130,25,3,19,0,10,8,31,6,5,0.292,0.289,0.581,2.43,2.63 +西川遥輝,日本ハム,0.239,71,155,134,32,2,13,7,14,0,34,7,0,0.343,0.311,0.654,4.18,4.11 +雄平,ヤクルト,0.28,47,153,143,40,0,8,2,7,0,19,3,0,0.308,0.313,0.621,3.4,3.31 +スレッジ,日本ハム,0.232,47,152,138,32,5,23,0,12,2,47,0,2,0.406,0.303,0.708,4.22,4.22 +下園辰哉,DeNA,0.252,90,150,139,35,0,14,0,10,0,21,1,6,0.324,0.302,0.626,2.93,2.77 +松井淳,ヤクルト,0.287,46,150,143,41,5,15,1,6,1,26,0,5,0.462,0.32,0.782,4.79,4.6 +野中信吾,オリックス,0.217,74,149,120,26,0,7,12,11,2,20,16,2,0.275,0.293,0.568,2.4,2.44 +古城茂幸,巨人,0.209,65,149,129,27,0,8,2,12,0,28,6,5,0.256,0.273,0.529,1.96,2.07 diff --git a/processor/src/test/resources/npb_similar_player.json b/processor/src/test/resources/npb_similar_player.json new file mode 100644 index 0000000..f95a474 --- /dev/null +++ b/processor/src/test/resources/npb_similar_player.json @@ -0,0 +1,16 @@ +{ + "method": "inverted_index", + "converter": { + "string_filter_types": {}, + "string_filter_rules": [], + "num_filter_types": {}, + "num_filter_rules": [], + "string_types": {}, + "string_rules": [], + "num_types": {}, + "num_rules": [ + {"key" : "*", "type" : "num"} + ] + }, + "parameter": {} +} diff --git a/processor/src/test/resources/npb_similar_player_data.json b/processor/src/test/resources/npb_similar_player_data.json new file mode 100644 index 0000000..3ad6a60 --- /dev/null +++ b/processor/src/test/resources/npb_similar_player_data.json @@ -0,0 +1,144 @@ +{ "id":"長野久義", "team":"巨人", "打率":0.301, "試合数":144, "打席":653, "打数":574, "安打":173, "本塁打":14, "打点":60, "盗塁":20, "四球":75, "死球":1, "三振":100, "犠打":2, "併殺打":5, "長打率":0.432, "出塁率":0.382, "OPS":0.815, "RC27":6.45, "XR27":6.2 } +{ "id":"大島洋平", "team":"中日", "打率":0.31, "試合数":144, "打席":631, "打数":555, "安打":172, "本塁打":1, "打点":13, "盗塁":32, "四球":46, "死球":13, "三振":80, "犠打":17, "併殺打":7, "長打率":0.368, "出塁率":0.376, "OPS":0.744, "RC27":5.13, "XR27":4.91 } +{ "id":"鳥谷敬", "team":"阪神", "打率":0.262, "試合数":144, "打席":624, "打数":515, "安打":135, "本塁打":8, "打点":59, "盗塁":15, "四球":94, "死球":2, "三振":91, "犠打":5, "併殺打":12, "長打率":0.375, "出塁率":0.373, "OPS":0.748, "RC27":5.28, "XR27":5.33 } +{ "id":"坂本勇人", "team":"巨人", "打率":0.311, "試合数":144, "打席":619, "打数":557, "安打":173, "本塁打":14, "打点":69, "盗塁":16, "四球":39, "死球":6, "三振":90, "犠打":12, "併殺打":5, "長打率":0.456, "出塁率":0.359, "OPS":0.815, "RC27":6.29, "XR27":6.06 } +{ "id":"中田翔", "team":"日本ハム", "打率":0.239, "試合数":144, "打席":606, "打数":547, "安打":131, "本塁打":24, "打点":77, "盗塁":5, "四球":50, "死球":5, "三振":101, "犠打":0, "併殺打":8, "長打率":0.42, "出塁率":0.307, "OPS":0.727, "RC27":4.44, "XR27":4.54 } +{ "id":"李大浩", "team":"オリックス", "打率":0.286, "試合数":144, "打席":601, "打数":525, "安打":150, "本塁打":24, "打点":91, "盗塁":0, "四球":64, "死球":7, "三振":85, "犠打":0, "併殺打":18, "長打率":0.478, "出塁率":0.368, "OPS":0.846, "RC27":5.99, "XR27":5.95 } +{ "id":"陽岱鋼", "team":"日本ハム", "打率":0.287, "試合数":144, "打席":599, "打数":533, "安打":153, "本塁打":7, "打点":55, "盗塁":17, "四球":37, "死球":6, "三振":123, "犠打":18, "併殺打":10, "長打率":0.398, "出塁率":0.337, "OPS":0.735, "RC27":4.7, "XR27":4.59 } +{ "id":"糸井嘉男", "team":"日本ハム", "打率":0.304, "試合数":134, "打席":597, "打数":510, "安打":155, "本塁打":9, "打点":48, "盗塁":22, "四球":75, "死球":11, "三振":86, "犠打":0, "併殺打":9, "長打率":0.41, "出塁率":0.404, "OPS":0.813, "RC27":6.47, "XR27":6.26 } +{ "id":"聖澤諒", "team":"楽天", "打率":0.27, "試合数":138, "打席":595, "打数":523, "安打":141, "本塁打":4, "打点":45, "盗塁":54, "四球":49, "死球":7, "三振":104, "犠打":12, "併殺打":4, "長打率":0.331, "出塁率":0.338, "OPS":0.669, 
"RC27":4.55, "XR27":4.54 } +{ "id":"田中浩康", "team":"ヤクルト", "打率":0.274, "試合数":139, "打席":593, "打数":486, "安打":133, "本塁打":2, "打点":40, "盗塁":1, "四球":54, "死球":9, "三振":60, "犠打":40, "併殺打":15, "長打率":0.323, "出塁率":0.354, "OPS":0.677, "RC27":3.89, "XR27":3.86 } +{ "id":"和田一浩", "team":"中日", "打率":0.285, "試合数":144, "打席":586, "打数":508, "安打":145, "本塁打":9, "打点":63, "盗塁":2, "四球":71, "死球":1, "三振":72, "犠打":0, "併殺打":14, "長打率":0.409, "出塁率":0.37, "OPS":0.78, "RC27":5.57, "XR27":5.47 } +{ "id":"根元俊一", "team":"ロッテ", "打率":0.279, "試合数":133, "打席":584, "打数":512, "安打":143, "本塁打":9, "打点":41, "盗塁":6, "四球":31, "死球":1, "三振":98, "犠打":40, "併殺打":7, "長打率":0.396, "出塁率":0.322, "OPS":0.718, "RC27":4.16, "XR27":4.01 } +{ "id":"ヘルマン", "team":"西武", "打率":0.27, "試合数":144, "打席":583, "打数":507, "安打":137, "本塁打":3, "打点":60, "盗塁":41, "四球":57, "死球":4, "三振":75, "犠打":10, "併殺打":20, "長打率":0.343, "出塁率":0.346, "OPS":0.689, "RC27":4.06, "XR27":4.05 } +{ "id":"井口資仁", "team":"ロッテ", "打率":0.255, "試合数":140, "打席":578, "打数":505, "安打":129, "本塁打":11, "打点":60, "盗塁":3, "四球":53, "死球":16, "三振":99, "犠打":0, "併殺打":11, "長打率":0.384, "出塁率":0.343, "OPS":0.727, "RC27":4.73, "XR27":4.7 } +{ "id":"村田修一", "team":"巨人", "打率":0.252, "試合数":144, "打席":575, "打数":516, "安打":130, "本塁打":12, "打点":58, "盗塁":1, "四球":36, "死球":15, "三振":85, "犠打":2, "併殺打":16, "長打率":0.374, "出塁率":0.316, "OPS":0.69, "RC27":3.83, "XR27":3.89 } +{ "id":"梵英心", "team":"広島", "打率":0.244, "試合数":137, "打席":575, "打数":499, "安打":122, "本塁打":10, "打点":52, "盗塁":14, "四球":50, "死球":1, "三振":70, "犠打":19, "併殺打":7, "長打率":0.359, "出塁率":0.311, "OPS":0.67, "RC27":3.77, "XR27":3.85 } +{ "id":"バルディリス", "team":"オリックス", "打率":0.264, "試合数":143, "打席":574, "打数":503, "安打":133, "本塁打":10, "打点":55, "盗塁":1, "四球":54, "死球":14, "三振":76, "犠打":0, "併殺打":13, "長打率":0.394, "出塁率":0.35, "OPS":0.744, "RC27":4.9, "XR27":4.8 } +{ "id":"荒木雅博", "team":"中日", "打率":0.251, "試合数":129, "打席":569, "打数":510, "安打":128, "本塁打":3, "打点":31, "盗塁":12, "四球":19, "死球":2, "三振":65, "犠打":36, "併殺打":4, "長打率":0.314, "出塁率":0.28, "OPS":0.593, "RC27":2.93, "XR27":2.89 } +{ "id":"内川聖一", "team":"ソフトバンク", "打率":0.3, "試合数":138, "打席":567, "打数":523, "安打":157, "本塁打":7, "打点":53, "盗塁":6, "四球":31, "死球":6, "三振":36, "犠打":0, "併殺打":12, "長打率":0.392, "出塁率":0.342, "OPS":0.734, "RC27":4.81, "XR27":4.73 } +{ "id":"中島裕之", "team":"西武", "打率":0.311, "試合数":136, "打席":567, "打数":499, "安打":155, "本塁打":13, "打点":74, "盗塁":7, "四球":52, "死球":9, "三振":76, "犠打":1, "併殺打":10, "長打率":0.451, "出塁率":0.382, "OPS":0.833, "RC27":6.29, "XR27":6.14 } +{ "id":"明石健志", "team":"ソフトバンク", "打率":0.254, "試合数":135, "打席":567, "打数":508, "安打":129, "本塁打":1, "打点":27, "盗塁":25, "四球":33, "死球":2, "三振":98, "犠打":23, "併殺打":2, "長打率":0.299, "出塁率":0.301, "OPS":0.601, "RC27":3.21, "XR27":3.21 } +{ "id":"サブロー", "team":"ロッテ", "打率":0.239, "試合数":137, "打席":560, "打数":476, "安打":114, "本塁打":7, "打点":52, "盗塁":0, "四球":78, "死球":2, "三振":105, "犠打":0, "併殺打":12, "長打率":0.342, "出塁率":0.346, "OPS":0.689, "RC27":4.29, "XR27":4.32 } +{ "id":"阿部慎之助", "team":"巨人", "打率":0.34, "試合数":138, "打席":556, "打数":467, "安打":159, "本塁打":27, "打点":104, "盗塁":0, "四球":69, "死球":9, "三振":47, "犠打":2, "併殺打":11, "長打率":0.565, "出塁率":0.429, "OPS":0.994, "RC27":9, "XR27":8.79 } +{ "id":"堂林翔太", "team":"広島", "打率":0.242, "試合数":144, "打席":554, "打数":488, "安打":118, "本塁打":14, "打点":45, "盗塁":5, "四球":44, "死球":14, "三振":150, "犠打":5, "併殺打":8, "長打率":0.395, "出塁率":0.321, "OPS":0.716, "RC27":4.4, "XR27":4.42 } +{ "id":"井端弘和", "team":"中日", "打率":0.284, "試合数":140, "打席":553, "打数":489, "安打":139, "本塁打":2, "打点":35, "盗塁":4, "四球":52, "死球":3, "三振":58, "犠打":8, "併殺打":14, "長打率":0.331, "出塁率":0.356, "OPS":0.687, "RC27":4.18, "XR27":4.06 } +{ "id":"荒波翔", 
"team":"DeNA", "打率":0.268, "試合数":141, "打席":550, "打数":504, "安打":135, "本塁打":1, "打点":25, "盗塁":24, "四球":23, "死球":5, "三振":90, "犠打":16, "併殺打":2, "長打率":0.333, "出塁率":0.305, "OPS":0.639, "RC27":3.59, "XR27":3.52 } +{ "id":"小谷野栄一", "team":"日本ハム", "打率":0.228, "試合数":134, "打席":550, "打数":478, "安打":109, "本塁打":3, "打点":39, "盗塁":6, "四球":27, "死球":4, "三振":72, "犠打":40, "併殺打":8, "長打率":0.293, "出塁率":0.275, "OPS":0.567, "RC27":2.48, "XR27":2.47 } +{ "id":"後藤光尊", "team":"オリックス", "打率":0.242, "試合数":131, "打席":546, "打数":520, "安打":126, "本塁打":4, "打点":43, "盗塁":3, "四球":14, "死球":5, "三振":74, "犠打":1, "併殺打":8, "長打率":0.306, "出塁率":0.266, "OPS":0.572, "RC27":2.6, "XR27":2.69 } +{ "id":"ミレッジ", "team":"ヤクルト", "打率":0.3, "試合数":125, "打席":546, "打数":476, "安打":143, "本塁打":21, "打点":65, "盗塁":9, "四球":57, "死球":6, "三振":79, "犠打":3, "併殺打":11, "長打率":0.485, "出塁率":0.379, "OPS":0.865, "RC27":6.46, "XR27":6.34 } +{ "id":"本多雄一", "team":"ソフトバンク", "打率":0.246, "試合数":123, "打席":536, "打数":480, "安打":118, "本塁打":0, "打点":31, "盗塁":34, "四球":37, "死球":1, "三振":65, "犠打":14, "併殺打":6, "長打率":0.294, "出塁率":0.299, "OPS":0.593, "RC27":3.26, "XR27":3.3 } +{ "id":"角中勝也", "team":"ロッテ", "打率":0.312, "試合数":128, "打席":525, "打数":477, "安打":149, "本塁打":3, "打点":61, "盗塁":8, "四球":38, "死球":5, "三振":68, "犠打":1, "併殺打":9, "長打率":0.415, "出塁率":0.366, "OPS":0.782, "RC27":5.67, "XR27":5.43 } +{ "id":"平野恵一", "team":"阪神", "打率":0.245, "試合数":134, "打席":519, "打数":458, "安打":112, "本塁打":1, "打点":24, "盗塁":6, "四球":42, "死球":4, "三振":61, "犠打":15, "併殺打":4, "長打率":0.271, "出塁率":0.313, "OPS":0.584, "RC27":3.12, "XR27":3.13 } +{ "id":"ペーニャ", "team":"ソフトバンク", "打率":0.28, "試合数":130, "打席":507, "打数":461, "安打":129, "本塁打":21, "打点":76, "盗塁":2, "四球":35, "死球":8, "三振":130, "犠打":0, "併殺打":11, "長打率":0.49, "出塁率":0.339, "OPS":0.829, "RC27":5.8, "XR27":5.68 } +{ "id":"川端慎吾", "team":"ヤクルト", "打率":0.298, "試合数":125, "打席":507, "打数":453, "安打":135, "本塁打":4, "打点":49, "盗塁":3, "四球":35, "死球":2, "三振":56, "犠打":13, "併殺打":7, "長打率":0.38, "出塁率":0.348, "OPS":0.728, "RC27":4.82, "XR27":4.72 } +{ "id":"田中賢介", "team":"日本ハム", "打率":0.3, "試合数":114, "打席":505, "打数":457, "安打":137, "本塁打":3, "打点":32, "盗塁":13, "四球":35, "死球":2, "三振":36, "犠打":8, "併殺打":2, "長打率":0.363, "出塁率":0.35, "OPS":0.713, "RC27":4.81, "XR27":4.7 } +{ "id":"ラミレス", "team":"DeNA", "打率":0.3, "試合数":137, "打席":504, "打数":476, "安打":143, "本塁打":19, "打点":76, "盗塁":0, "四球":18, "死球":7, "三振":60, "犠打":0, "併殺打":18, "長打率":0.473, "出塁率":0.333, "OPS":0.806, "RC27":5.26, "XR27":5.13 } +{ "id":"今江敏晃", "team":"ロッテ", "打率":0.253, "試合数":136, "打席":501, "打数":446, "安打":113, "本塁打":6, "打点":47, "盗塁":0, "四球":19, "死球":5, "三振":37, "犠打":23, "併殺打":15, "長打率":0.354, "出塁率":0.287, "OPS":0.641, "RC27":2.98, "XR27":3.08 } +{ "id":"中村紀洋", "team":"DeNA", "打率":0.274, "試合数":126, "打席":500, "打数":442, "安打":121, "本塁打":11, "打点":61, "盗塁":1, "四球":50, "死球":2, "三振":72, "犠打":0, "併殺打":14, "長打率":0.407, "出塁率":0.346, "OPS":0.753, "RC27":4.86, "XR27":4.87 } +{ "id":"中村剛也", "team":"西武", "打率":0.231, "試合数":123, "打席":498, "打数":432, "安打":100, "本塁打":27, "打点":79, "盗塁":2, "四球":56, "死球":9, "三振":125, "犠打":0, "併殺打":11, "長打率":0.461, "出塁率":0.331, "OPS":0.792, "RC27":5.04, "XR27":5.16 } +{ "id":"稲葉篤紀", "team":"日本ハム", "打率":0.29, "試合数":127, "打席":497, "打数":449, "安打":130, "本塁打":10, "打点":61, "盗塁":0, "四球":32, "死球":5, "三振":70, "犠打":8, "併殺打":3, "長打率":0.421, "出塁率":0.342, "OPS":0.762, "RC27":5.38, "XR27":5.22 } +{ "id":"畠山和洋", "team":"ヤクルト", "打率":0.266, "試合数":121, "打席":497, "打数":455, "安打":121, "本塁打":13, "打点":55, "盗塁":2, "四球":37, "死球":2, "三振":64, "犠打":1, "併殺打":13, "長打率":0.402, "出塁率":0.323, "OPS":0.725, "RC27":4.39, "XR27":4.35 } +{ "id":"森野将彦", "team":"中日", "打率":0.249, "試合数":124, 
"打席":494, "打数":434, "安打":108, "本塁打":6, "打点":50, "盗塁":1, "四球":49, "死球":3, "三振":68, "犠打":4, "併殺打":13, "長打率":0.348, "出塁率":0.327, "OPS":0.674, "RC27":3.87, "XR27":3.89 } +{ "id":"新井貴浩", "team":"阪神", "打率":0.25, "試合数":122, "打席":493, "打数":460, "安打":115, "本塁打":9, "打点":52, "盗塁":1, "四球":30, "死球":1, "三振":85, "犠打":0, "併殺打":12, "長打率":0.363, "出塁率":0.296, "OPS":0.659, "RC27":3.43, "XR27":3.4 } +{ "id":"フェルナンデス", "team":"楽天", "打率":0.243, "試合数":129, "打席":489, "打数":440, "安打":107, "本塁打":3, "打点":51, "盗塁":2, "四球":44, "死球":2, "三振":67, "犠打":0, "併殺打":11, "長打率":0.311, "出塁率":0.313, "OPS":0.624, "RC27":3.25, "XR27":3.28 } +{ "id":"銀次", "team":"楽天", "打率":0.28, "試合数":126, "打席":485, "打数":432, "安打":121, "本塁打":4, "打点":45, "盗塁":8, "四球":22, "死球":4, "三振":37, "犠打":23, "併殺打":4, "長打率":0.354, "出塁率":0.318, "OPS":0.672, "RC27":3.98, "XR27":3.94 } +{ "id":"川端崇義", "team":"オリックス", "打率":0.266, "試合数":125, "打席":484, "打数":429, "安打":114, "本塁打":2, "打点":27, "盗塁":6, "四球":22, "死球":9, "三振":59, "犠打":24, "併殺打":8, "長打率":0.317, "出塁率":0.315, "OPS":0.632, "RC27":3.36, "XR27":3.25 } +{ "id":"岡田幸文", "team":"ロッテ", "打率":0.262, "試合数":131, "打席":482, "打数":431, "安打":113, "本塁打":0, "打点":18, "盗塁":23, "四球":18, "死球":6, "三振":47, "犠打":25, "併殺打":4, "長打率":0.288, "出塁率":0.3, "OPS":0.587, "RC27":2.82, "XR27":2.85 } +{ "id":"長谷川勇也", "team":"ソフトバンク", "打率":0.278, "試合数":126, "打席":473, "打数":403, "安打":112, "本塁打":4, "打点":37, "盗塁":16, "四球":40, "死球":9, "三振":87, "犠打":16, "併殺打":4, "長打率":0.36, "出塁率":0.352, "OPS":0.712, "RC27":4.77, "XR27":4.76 } +{ "id":"マートン", "team":"阪神", "打率":0.26, "試合数":121, "打席":473, "打数":453, "安打":118, "本塁打":5, "打点":38, "盗塁":2, "四球":18, "死球":1, "三振":56, "犠打":0, "併殺打":14, "長打率":0.342, "出塁率":0.29, "OPS":0.632, "RC27":3.01, "XR27":2.96 } +{ "id":"栗山巧", "team":"西武", "打率":0.289, "試合数":103, "打席":467, "打数":394, "安打":114, "本塁打":2, "打点":33, "盗塁":3, "四球":52, "死球":6, "三振":62, "犠打":12, "併殺打":4, "長打率":0.353, "出塁率":0.378, "OPS":0.731, "RC27":5.28, "XR27":5.18 } +{ "id":"浅村栄斗", "team":"西武", "打率":0.245, "試合数":114, "打席":459, "打数":404, "安打":99, "本塁打":7, "打点":37, "盗塁":13, "四球":34, "死球":4, "三振":63, "犠打":13, "併殺打":9, "長打率":0.376, "出塁率":0.307, "OPS":0.683, "RC27":3.79, "XR27":3.84 } +{ "id":"谷繁元信", "team":"中日", "打率":0.228, "試合数":134, "打席":458, "打数":386, "安打":88, "本塁打":5, "打点":32, "盗塁":0, "四球":52, "死球":4, "三振":67, "犠打":14, "併殺打":15, "長打率":0.303, "出塁率":0.324, "OPS":0.627, "RC27":3.11, "XR27":3.16 } +{ "id":"秋山翔吾", "team":"西武", "打率":0.293, "試合数":107, "打席":450, "打数":403, "安打":118, "本塁打":4, "打点":37, "盗塁":10, "四球":28, "死球":3, "三振":70, "犠打":15, "併殺打":7, "長打率":0.404, "出塁率":0.343, "OPS":0.747, "RC27":4.82, "XR27":4.63 } +{ "id":"筒香嘉智", "team":"DeNA", "打率":0.218, "試合数":108, "打席":446, "打数":386, "安打":84, "本塁打":10, "打点":45, "盗塁":1, "四球":51, "死球":2, "三振":102, "犠打":2, "併殺打":4, "長打率":0.352, "出塁率":0.309, "OPS":0.661, "RC27":3.8, "XR27":3.98 } +{ "id":"高橋由伸", "team":"巨人", "打率":0.239, "試合数":130, "打席":442, "打数":368, "安打":88, "本塁打":8, "打点":56, "盗塁":2, "四球":61, "死球":8, "三振":77, "犠打":1, "併殺打":14, "長打率":0.351, "出塁率":0.356, "OPS":0.707, "RC27":4.23, "XR27":4.32 } +{ "id":"里崎智也", "team":"ロッテ", "打率":0.244, "試合数":120, "打席":439, "打数":385, "安打":94, "本塁打":9, "打点":41, "盗塁":0, "四球":33, "死球":4, "三振":80, "犠打":14, "併殺打":11, "長打率":0.345, "出塁率":0.308, "OPS":0.654, "RC27":3.38, "XR27":3.46 } +{ "id":"松井稼頭央", "team":"楽天", "打率":0.266, "試合数":106, "打席":433, "打数":402, "安打":107, "本塁打":9, "打点":43, "盗塁":14, "四球":26, "死球":1, "三振":55, "犠打":3, "併殺打":6, "長打率":0.408, "出塁率":0.312, "OPS":0.72, "RC27":4.58, "XR27":4.45 } +{ "id":"大引啓次", "team":"オリックス", "打率":0.224, "試合数":110, "打席":432, "打数":352, "安打":79, "本塁打":6, "打点":20, "盗塁":6, 
"四球":46, "死球":3, "三振":74, "犠打":28, "併殺打":8, "長打率":0.315, "出塁率":0.317, "OPS":0.632, "RC27":3.33, "XR27":3.44 } +{ "id":"牧田明久", "team":"楽天", "打率":0.225, "試合数":123, "打席":425, "打数":378, "安打":85, "本塁打":9, "打点":53, "盗塁":5, "四球":23, "死球":9, "三振":49, "犠打":10, "併殺打":11, "長打率":0.331, "出塁率":0.283, "OPS":0.613, "RC27":2.68, "XR27":2.86 } +{ "id":"バレンティン", "team":"ヤクルト", "打率":0.272, "試合数":106, "打席":422, "打数":353, "安打":96, "本塁打":31, "打点":81, "盗塁":1, "四球":64, "死球":3, "三振":92, "犠打":0, "併殺打":14, "長打率":0.572, "出塁率":0.386, "OPS":0.958, "RC27":7.34, "XR27":7.35 } +{ "id":"炭谷銀仁朗", "team":"西武", "打率":0.194, "試合数":139, "打席":414, "打数":360, "安打":70, "本塁打":0, "打点":23, "盗塁":0, "四球":17, "死球":1, "三振":74, "犠打":35, "併殺打":9, "長打率":0.233, "出塁率":0.232, "OPS":0.466, "RC27":1.17, "XR27":1.23 } +{ "id":"金本知憲", "team":"阪神", "打率":0.258, "試合数":126, "打席":406, "打数":356, "安打":92, "本塁打":6, "打点":30, "盗塁":3, "四球":49, "死球":0, "三振":54, "犠打":0, "併殺打":2, "長打率":0.36, "出塁率":0.347, "OPS":0.707, "RC27":4.78, "XR27":4.74 } +{ "id":"T-岡田", "team":"オリックス", "打率":0.28, "試合数":103, "打席":404, "打数":378, "安打":106, "本塁打":10, "打点":56, "盗塁":4, "四球":22, "死球":4, "三振":81, "犠打":0, "併殺打":5, "長打率":0.418, "出塁率":0.327, "OPS":0.745, "RC27":4.87, "XR27":4.7 } +{ "id":"天谷宗一郎", "team":"広島", "打率":0.265, "試合数":108, "打席":397, "打数":359, "安打":95, "本塁打":6, "打点":25, "盗塁":12, "四球":32, "死球":3, "三振":64, "犠打":2, "併殺打":3, "長打率":0.373, "出塁率":0.329, "OPS":0.702, "RC27":4.33, "XR27":4.25 } +{ "id":"宮本慎也", "team":"ヤクルト", "打率":0.267, "試合数":110, "打席":394, "打数":356, "安打":95, "本塁打":3, "打点":23, "盗塁":1, "四球":16, "死球":5, "三振":38, "犠打":16, "併殺打":9, "長打率":0.306, "出塁率":0.307, "OPS":0.613, "RC27":3.08, "XR27":3.05 } +{ "id":"松田宣浩", "team":"ソフトバンク", "打率":0.3, "試合数":95, "打席":390, "打数":360, "安打":108, "本塁打":9, "打点":56, "盗塁":16, "四球":27, "死球":1, "三振":63, "犠打":0, "併殺打":7, "長打率":0.492, "出塁率":0.349, "OPS":0.84, "RC27":5.87, "XR27":5.59 } +{ "id":"新井良太", "team":"阪神", "打率":0.28, "試合数":110, "打席":370, "打数":322, "安打":90, "本塁打":11, "打点":32, "盗塁":1, "四球":38, "死球":6, "三振":73, "犠打":3, "併殺打":7, "長打率":0.438, "出塁率":0.365, "OPS":0.803, "RC27":5.77, "XR27":5.65 } +{ "id":"スケールズ", "team":"オリックス", "打率":0.262, "試合数":85, "打席":362, "打数":305, "安打":80, "本塁打":5, "打点":23, "盗塁":4, "四球":47, "死球":8, "三振":103, "犠打":0, "併殺打":3, "長打率":0.384, "出塁率":0.373, "OPS":0.757, "RC27":5.6, "XR27":5.54 } +{ "id":"ブランコ", "team":"中日", "打率":0.248, "試合数":96, "打席":359, "打数":311, "安打":77, "本塁打":24, "打点":65, "盗塁":2, "四球":40, "死球":5, "三振":84, "犠打":0, "併殺打":5, "長打率":0.511, "出塁率":0.34, "OPS":0.851, "RC27":6.06, "XR27":6.22 } +{ "id":"小久保裕紀", "team":"ソフトバンク", "打率":0.239, "試合数":103, "打席":357, "打数":331, "安打":79, "本塁打":4, "打点":34, "盗塁":0, "四球":20, "死球":0, "三振":67, "犠打":3, "併殺打":6, "長打率":0.308, "出塁率":0.28, "OPS":0.588, "RC27":2.8, "XR27":2.89 } +{ "id":"鉄平", "team":"楽天", "打率":0.251, "試合数":121, "打席":355, "打数":315, "安打":79, "本塁打":1, "打点":33, "盗塁":8, "四球":11, "死球":6, "三振":45, "犠打":18, "併殺打":7, "長打率":0.302, "出塁率":0.285, "OPS":0.586, "RC27":2.62, "XR27":2.75 } +{ "id":"大和", "team":"阪神", "打率":0.257, "試合数":128, "打席":349, "打数":311, "安打":80, "本塁打":0, "打点":26, "盗塁":17, "四球":15, "死球":4, "三振":46, "犠打":19, "併殺打":3, "長打率":0.318, "出塁率":0.3, "OPS":0.618, "RC27":3.19, "XR27":3.1 } +{ "id":"今宮健太", "team":"ソフトバンク", "打率":0.238, "試合数":126, "打席":343, "打数":307, "安打":73, "本塁打":2, "打点":14, "盗塁":8, "四球":10, "死球":3, "三振":75, "犠打":21, "併殺打":2, "長打率":0.29, "出塁率":0.267, "OPS":0.557, "RC27":2.44, "XR27":2.51 } +{ "id":"丸佳浩", "team":"広島", "打率":0.247, "試合数":106, "打席":339, "打数":283, "安打":70, "本塁打":4, "打点":22, "盗塁":14, "四球":46, "死球":2, "三振":59, "犠打":5, "併殺打":4, "長打率":0.353, "出塁率":0.353, 
"OPS":0.707, "RC27":4.62, "XR27":4.7 } +{ "id":"廣瀬純", "team":"広島", "打率":0.241, "試合数":102, "打席":337, "打数":294, "安打":71, "本塁打":6, "打点":33, "盗塁":0, "四球":24, "死球":11, "三振":55, "犠打":7, "併殺打":3, "長打率":0.347, "出塁率":0.321, "OPS":0.668, "RC27":4.02, "XR27":4.02 } +{ "id":"金城龍彦", "team":"DeNA", "打率":0.238, "試合数":129, "打席":331, "打数":294, "安打":70, "本塁打":3, "打点":18, "盗塁":2, "四球":26, "死球":6, "三振":37, "犠打":3, "併殺打":1, "長打率":0.306, "出塁率":0.311, "OPS":0.617, "RC27":3.58, "XR27":3.65 } +{ "id":"鶴岡慎也", "team":"日本ハム", "打率":0.266, "試合数":116, "打席":328, "打数":289, "安打":77, "本塁打":0, "打点":25, "盗塁":3, "四球":14, "死球":1, "三振":41, "犠打":23, "併殺打":11, "長打率":0.308, "出塁率":0.302, "OPS":0.61, "RC27":2.68, "XR27":2.61 } +{ "id":"ホフパワー", "team":"日本ハム", "打率":0.247, "試合数":109, "打席":325, "打数":296, "安打":73, "本塁打":14, "打点":37, "盗塁":1, "四球":25, "死球":3, "三振":81, "犠打":0, "併殺打":3, "長打率":0.432, "出塁率":0.311, "OPS":0.743, "RC27":4.74, "XR27":4.77 } +{ "id":"嶋基宏", "team":"楽天", "打率":0.291, "試合数":91, "打席":316, "打数":265, "安打":77, "本塁打":1, "打点":8, "盗塁":3, "四球":33, "死球":4, "三振":51, "犠打":13, "併殺打":5, "長打率":0.332, "出塁率":0.376, "OPS":0.708, "RC27":4.53, "XR27":4.47 } +{ "id":"金子誠", "team":"日本ハム", "打率":0.227, "試合数":103, "打席":311, "打数":277, "安打":63, "本塁打":0, "打点":22, "盗塁":1, "四球":15, "死球":0, "三振":53, "犠打":17, "併殺打":2, "長打率":0.292, "出塁率":0.265, "OPS":0.558, "RC27":2.3, "XR27":2.31 } +{ "id":"石川雄洋", "team":"DeNA", "打率":0.285, "試合数":80, "打席":304, "打数":263, "安打":75, "本塁打":1, "打点":14, "盗塁":7, "四球":24, "死球":3, "三振":49, "犠打":13, "併殺打":3, "長打率":0.342, "出塁率":0.351, "OPS":0.693, "RC27":4.21, "XR27":4.11 } +{ "id":"平田良介", "team":"中日", "打率":0.216, "試合数":91, "打席":301, "打数":269, "安打":58, "本塁打":11, "打点":32, "盗塁":1, "四球":28, "死球":1, "三振":59, "犠打":1, "併殺打":9, "長打率":0.361, "出塁率":0.29, "OPS":0.651, "RC27":3.26, "XR27":3.47 } +{ "id":"大崎雄太朗", "team":"西武", "打率":0.269, "試合数":107, "打席":298, "打数":260, "安打":70, "本塁打":1, "打点":22, "盗塁":1, "四球":17, "死球":7, "三振":27, "犠打":10, "併殺打":4, "長打率":0.346, "出塁率":0.326, "OPS":0.673, "RC27":4.06, "XR27":4.04 } +{ "id":"ガルシア", "team":"楽天", "打率":0.227, "試合数":77, "打席":296, "打数":269, "安打":61, "本塁打":7, "打点":30, "盗塁":0, "四球":23, "死球":2, "三振":65, "犠打":0, "併殺打":6, "長打率":0.349, "出塁率":0.291, "OPS":0.64, "RC27":3.27, "XR27":3.38 } +{ "id":"ブラゼル", "team":"阪神", "打率":0.233, "試合数":98, "打席":295, "打数":275, "安打":64, "本塁打":12, "打点":43, "盗塁":0, "四球":18, "死球":2, "三振":72, "犠打":0, "併殺打":16, "長打率":0.404, "出塁率":0.285, "OPS":0.688, "RC27":3.15, "XR27":3.21 } +{ "id":"清田育宏", "team":"ロッテ", "打率":0.281, "試合数":87, "打席":292, "打数":253, "安打":71, "本塁打":3, "打点":29, "盗塁":5, "四球":34, "死球":0, "三振":46, "犠打":3, "併殺打":5, "長打率":0.391, "出塁率":0.363, "OPS":0.755, "RC27":5, "XR27":4.9 } +{ "id":"枡田慎太郎", "team":"楽天", "打率":0.295, "試合数":79, "打席":290, "打数":254, "安打":75, "本塁打":5, "打点":32, "盗塁":1, "四球":19, "死球":5, "三振":56, "犠打":7, "併殺打":2, "長打率":0.433, "出塁率":0.35, "OPS":0.783, "RC27":5.55, "XR27":5.51 } +{ "id":"内村賢介", "team":"DeNA", "打率":0.237, "試合数":77, "打席":289, "打数":236, "安打":56, "本塁打":0, "打点":18, "盗塁":18, "四球":31, "死球":0, "三振":40, "犠打":21, "併殺打":4, "長打率":0.263, "出塁率":0.325, "OPS":0.587, "RC27":3.19, "XR27":3.22 } +{ "id":"森岡良介", "team":"ヤクルト", "打率":0.249, "試合数":100, "打席":282, "打数":245, "安打":61, "本塁打":1, "打点":18, "盗塁":1, "四球":11, "死球":5, "三振":44, "犠打":19, "併殺打":1, "長打率":0.318, "出塁率":0.293, "OPS":0.611, "RC27":3.16, "XR27":3.17 } +{ "id":"東出輝裕", "team":"広島", "打率":0.247, "試合数":91, "打席":282, "打数":247, "安打":61, "本塁打":0, "打点":6, "盗塁":1, "四球":13, "死球":3, "三振":22, "犠打":19, "併殺打":3, "長打率":0.279, "出塁率":0.293, "OPS":0.572, "RC27":2.62, "XR27":2.57 } +{ "id":"藤村大介", "team":"巨人", "打率":0.252, 
"試合数":109, "打席":279, "打数":238, "安打":60, "本塁打":0, "打点":10, "盗塁":14, "四球":15, "死球":0, "三振":35, "犠打":26, "併殺打":2, "長打率":0.298, "出塁率":0.296, "OPS":0.595, "RC27":2.84, "XR27":2.79 } +{ "id":"大松尚逸", "team":"ロッテ", "打率":0.198, "試合数":89, "打席":275, "打数":258, "安打":51, "本塁打":5, "打点":22, "盗塁":0, "四球":9, "死球":2, "三振":46, "犠打":3, "併殺打":7, "長打率":0.291, "出塁率":0.228, "OPS":0.519, "RC27":1.7, "XR27":1.91 } +{ "id":"ホワイトセル", "team":"ロッテ", "打率":0.309, "試合数":63, "打席":257, "打数":223, "安打":69, "本塁打":9, "打点":43, "盗塁":0, "四球":29, "死球":4, "三振":75, "犠打":0, "併殺打":2, "長打率":0.489, "出塁率":0.397, "OPS":0.886, "RC27":7.37, "XR27":7.16 } +{ "id":"谷佳知", "team":"巨人", "打率":0.258, "試合数":89, "打席":255, "打数":229, "安打":59, "本塁打":3, "打点":22, "盗塁":1, "四球":14, "死球":0, "三振":48, "犠打":11, "併殺打":5, "長打率":0.319, "出塁率":0.299, "OPS":0.618, "RC27":2.97, "XR27":2.97 } +{ "id":"梶谷隆幸", "team":"DeNA", "打率":0.179, "試合数":80, "打席":252, "打数":223, "安打":40, "本塁打":2, "打点":11, "盗塁":5, "四球":21, "死球":1, "三振":61, "犠打":7, "併殺打":4, "長打率":0.256, "出塁率":0.253, "OPS":0.509, "RC27":1.48, "XR27":1.64 } +{ "id":"エルドレッド", "team":"広島", "打率":0.262, "試合数":65, "打席":251, "打数":225, "安打":59, "本塁打":11, "打点":35, "盗塁":0, "四球":20, "死球":3, "三振":67, "犠打":0, "併殺打":3, "長打率":0.453, "出塁率":0.327, "OPS":0.78, "RC27":5.29, "XR27":5.38 } +{ "id":"石原慶幸", "team":"広島", "打率":0.24, "試合数":77, "打席":250, "打数":217, "安打":52, "本塁打":1, "打点":22, "盗塁":0, "四球":22, "死球":1, "三振":37, "犠打":9, "併殺打":7, "長打率":0.323, "出塁率":0.311, "OPS":0.634, "RC27":3.11, "XR27":2.99 } +{ "id":"中村悠平", "team":"ヤクルト", "打率":0.254, "試合数":91, "打席":249, "打数":209, "安打":53, "本塁打":1, "打点":15, "盗塁":1, "四球":28, "死球":2, "三振":31, "犠打":8, "併殺打":10, "長打率":0.292, "出塁率":0.344, "OPS":0.636, "RC27":3.23, "XR27":3.25 } +{ "id":"赤松真人", "team":"広島", "打率":0.242, "試合数":76, "打席":246, "打数":207, "安打":50, "本塁打":3, "打点":15, "盗塁":18, "四球":18, "死球":4, "三振":33, "犠打":16, "併殺打":4, "長打率":0.314, "出塁率":0.313, "OPS":0.627, "RC27":3.62, "XR27":3.65 } +{ "id":"相川亮二", "team":"ヤクルト", "打率":0.245, "試合数":72, "打席":244, "打数":220, "安打":54, "本塁打":1, "打点":28, "盗塁":0, "四球":20, "死球":1, "三振":33, "犠打":1, "併殺打":7, "長打率":0.295, "出塁率":0.309, "OPS":0.604, "RC27":2.98, "XR27":3 } +{ "id":"岩本貴裕", "team":"広島", "打率":0.268, "試合数":71, "打席":242, "打数":239, "安打":64, "本塁打":6, "打点":27, "盗塁":1, "四球":2, "死球":0, "三振":43, "犠打":1, "併殺打":5, "長打率":0.406, "出塁率":0.274, "OPS":0.68, "RC27":3.63, "XR27":3.5 } +{ "id":"菊池涼介", "team":"広島", "打率":0.229, "試合数":63, "打席":234, "打数":201, "安打":46, "本塁打":2, "打点":12, "盗塁":4, "四球":6, "死球":1, "三振":42, "犠打":25, "併殺打":5, "長打率":0.294, "出塁率":0.254, "OPS":0.547, "RC27":1.99, "XR27":2.04 } +{ "id":"片岡易之", "team":"西武", "打率":0.225, "試合数":52, "打席":231, "打数":204, "安打":46, "本塁打":2, "打点":19, "盗塁":8, "四球":15, "死球":1, "三振":30, "犠打":7, "併殺打":6, "長打率":0.294, "出塁率":0.277, "OPS":0.571, "RC27":2.19, "XR27":2.44 } +{ "id":"森本稀哲", "team":"DeNA", "打率":0.244, "試合数":108, "打席":230, "打数":201, "安打":49, "本塁打":3, "打点":18, "盗塁":0, "四球":20, "死球":3, "三振":44, "犠打":4, "併殺打":6, "長打率":0.323, "出塁率":0.32, "OPS":0.643, "RC27":3.27, "XR27":3.34 } +{ "id":"鶴岡一成", "team":"DeNA", "打率":0.189, "試合数":102, "打席":229, "打数":201, "安打":38, "本塁打":1, "打点":15, "盗塁":0, "四球":19, "死球":0, "三振":39, "犠打":8, "併殺打":4, "長打率":0.254, "出塁率":0.258, "OPS":0.512, "RC27":1.87, "XR27":1.94 } +{ "id":"松本哲也", "team":"巨人", "打率":0.258, "試合数":83, "打席":229, "打数":198, "安打":51, "本塁打":0, "打点":11, "盗塁":12, "四球":17, "死球":0, "三振":34, "犠打":12, "併殺打":4, "長打率":0.313, "出塁率":0.313, "OPS":0.626, "RC27":3.5, "XR27":3.5 } +{ "id":"藤井彰人", "team":"阪神", "打率":0.248, "試合数":81, "打席":228, "打数":210, "安打":52, "本塁打":1, "打点":10, "盗塁":0, "四球":9, "死球":2, "三振":39, "犠打":7, 
"併殺打":3, "長打率":0.286, "出塁率":0.285, "OPS":0.571, "RC27":2.59, "XR27":2.54 } +{ "id":"寺内崇幸", "team":"巨人", "打率":0.241, "試合数":103, "打席":225, "打数":191, "安打":46, "本塁打":1, "打点":5, "盗塁":11, "四球":16, "死球":3, "三振":43, "犠打":13, "併殺打":6, "長打率":0.293, "出塁率":0.307, "OPS":0.6, "RC27":2.97, "XR27":3.06 } +{ "id":"上本博紀", "team":"阪神", "打率":0.254, "試合数":62, "打席":224, "打数":197, "安打":50, "本塁打":1, "打点":7, "盗塁":13, "四球":19, "死球":5, "三振":46, "犠打":2, "併殺打":3, "長打率":0.345, "出塁率":0.333, "OPS":0.679, "RC27":4.26, "XR27":4.18 } +{ "id":"多村仁志", "team":"ソフトバンク", "打率":0.25, "試合数":79, "打席":218, "打数":200, "安打":50, "本塁打":4, "打点":20, "盗塁":0, "四球":18, "死球":0, "三振":43, "犠打":0, "併殺打":7, "長打率":0.365, "出塁率":0.312, "OPS":0.677, "RC27":3.57, "XR27":3.53 } +{ "id":"細川亨", "team":"ソフトバンク", "打率":0.157, "試合数":92, "打席":217, "打数":185, "安打":29, "本塁打":2, "打点":13, "盗塁":0, "四球":8, "死球":1, "三振":50, "犠打":21, "併殺打":1, "長打率":0.205, "出塁率":0.194, "OPS":0.399, "RC27":0.72, "XR27":0.98 } +{ "id":"福地寿樹", "team":"ヤクルト", "打率":0.255, "試合数":83, "打席":216, "打数":188, "安打":48, "本塁打":0, "打点":19, "盗塁":12, "四球":16, "死球":4, "三振":29, "犠打":6, "併殺打":3, "長打率":0.335, "出塁率":0.324, "OPS":0.659, "RC27":4.17, "XR27":4.1 } +{ "id":"山崎武司", "team":"中日", "打率":0.209, "試合数":90, "打席":215, "打数":191, "安打":40, "本塁打":1, "打点":13, "盗塁":1, "四球":18, "死球":4, "三振":44, "犠打":0, "併殺打":5, "長打率":0.293, "出塁率":0.288, "OPS":0.582, "RC27":2.72, "XR27":2.79 } +{ "id":"オーティズ", "team":"西武", "打率":0.286, "試合数":64, "打席":214, "打数":199, "安打":57, "本塁打":9, "打点":21, "盗塁":0, "四球":14, "死球":1, "三振":38, "犠打":0, "併殺打":8, "長打率":0.462, "出塁率":0.336, "OPS":0.799, "RC27":5, "XR27":4.91 } +{ "id":"渡辺直人", "team":"DeNA", "打率":0.224, "試合数":70, "打席":213, "打数":174, "安打":39, "本塁打":0, "打点":10, "盗塁":2, "四球":24, "死球":10, "三振":27, "犠打":5, "併殺打":1, "長打率":0.247, "出塁率":0.351, "OPS":0.598, "RC27":3.35, "XR27":3.44 } +{ "id":"柳田悠岐", "team":"ソフトバンク", "打率":0.246, "試合数":68, "打席":212, "打数":195, "安打":48, "本塁打":5, "打点":18, "盗塁":6, "四球":10, "死球":5, "三振":56, "犠打":2, "併殺打":2, "長打率":0.385, "出塁率":0.3, "OPS":0.685, "RC27":4.14, "XR27":4.05 } +{ "id":"高須洋介", "team":"楽天", "打率":0.244, "試合数":58, "打席":212, "打数":176, "安打":43, "本塁打":0, "打点":9, "盗塁":2, "四球":24, "死球":2, "三振":15, "犠打":8, "併殺打":5, "長打率":0.284, "出塁率":0.338, "OPS":0.622, "RC27":3.47, "XR27":3.53 } +{ "id":"ボウカー", "team":"巨人", "打率":0.196, "試合数":69, "打席":204, "打数":184, "安打":36, "本塁打":3, "打点":10, "盗塁":2, "四球":16, "死球":2, "三振":54, "犠打":2, "併殺打":3, "長打率":0.31, "出塁率":0.267, "OPS":0.577, "RC27":2.64, "XR27":2.66 } +{ "id":"ニック", "team":"広島", "打率":0.238, "試合数":52, "打席":204, "打数":181, "安打":43, "本塁打":9, "打点":24, "盗塁":0, "四球":22, "死球":1, "三振":42, "犠打":0, "併殺打":4, "長打率":0.431, "出塁率":0.324, "OPS":0.754, "RC27":4.78, "XR27":4.79 } +{ "id":"福浦和也", "team":"ロッテ", "打率":0.25, "試合数":84, "打席":199, "打数":180, "安打":45, "本塁打":0, "打点":25, "盗塁":0, "四球":15, "死球":1, "三振":27, "犠打":1, "併殺打":3, "長打率":0.267, "出塁率":0.308, "OPS":0.575, "RC27":2.92, "XR27":3.03 } +{ "id":"伊藤光", "team":"オリックス", "打率":0.205, "試合数":66, "打席":197, "打数":176, "安打":36, "本塁打":0, "打点":10, "盗塁":0, "四球":5, "死球":0, "三振":48, "犠打":12, "併殺打":2, "長打率":0.256, "出塁率":0.222, "OPS":0.477, "RC27":1.49, "XR27":1.7 } +{ "id":"エドガー", "team":"巨人", "打率":0.236, "試合数":57, "打席":190, "打数":174, "安打":41, "本塁打":4, "打点":19, "盗塁":0, "四球":13, "死球":1, "三振":46, "犠打":1, "併殺打":6, "長打率":0.374, "出塁率":0.291, "OPS":0.665, "RC27":3.17, "XR27":3.19 } +{ "id":"藤田一也", "team":"楽天", "打率":0.308, "試合数":63, "打席":188, "打数":172, "安打":53, "本塁打":0, "打点":15, "盗塁":5, "四球":4, "死球":0, "三振":10, "犠打":11, "併殺打":4, "長打率":0.378, "出塁率":0.322, "OPS":0.7, "RC27":4.05, "XR27":3.84 } +{ "id":"荻野貴司", "team":"ロッテ", 
"打率":0.224, "試合数":61, "打席":187, "打数":165, "安打":37, "本塁打":1, "打点":8, "盗塁":13, "四球":11, "死球":5, "三振":17, "犠打":6, "併殺打":0, "長打率":0.273, "出塁率":0.293, "OPS":0.566, "RC27":3.11, "XR27":3.14 } +{ "id":"堂上直倫", "team":"中日", "打率":0.21, "試合数":116, "打席":182, "打数":167, "安打":35, "本塁打":0, "打点":11, "盗塁":1, "四球":5, "死球":1, "三振":34, "犠打":8, "併殺打":6, "長打率":0.257, "出塁率":0.236, "OPS":0.493, "RC27":1.39, "XR27":1.42 } +{ "id":"江川智晃", "team":"ソフトバンク", "打率":0.244, "試合数":56, "打席":175, "打数":160, "安打":39, "本塁打":4, "打点":18, "盗塁":1, "四球":8, "死球":0, "三振":37, "犠打":4, "併殺打":3, "長打率":0.388, "出塁率":0.275, "OPS":0.662, "RC27":3.5, "XR27":3.7 } +{ "id":"上田剛史", "team":"ヤクルト", "打率":0.257, "試合数":50, "打席":173, "打数":148, "安打":38, "本塁打":0, "打点":12, "盗塁":8, "四球":12, "死球":0, "三振":30, "犠打":13, "併殺打":2, "長打率":0.318, "出塁率":0.313, "OPS":0.63, "RC27":3.23, "XR27":3.15 } +{ "id":"小宮山慎二", "team":"阪神", "打率":0.148, "試合数":72, "打席":171, "打数":149, "安打":22, "本塁打":1, "打点":4, "盗塁":0, "四球":11, "死球":2, "三振":32, "犠打":9, "併殺打":4, "長打率":0.174, "出塁率":0.216, "OPS":0.391, "RC27":0.53, "XR27":0.69 } +{ "id":"堂上剛裕", "team":"中日", "打率":0.282, "試合数":85, "打席":166, "打数":156, "安打":44, "本塁打":4, "打点":17, "盗塁":1, "四球":8, "死球":0, "三振":36, "犠打":1, "併殺打":4, "長打率":0.41, "出塁率":0.315, "OPS":0.725, "RC27":4.3, "XR27":4.19 } +{ "id":"坂口智隆", "team":"オリックス", "打率":0.228, "試合数":40, "打席":165, "打数":158, "安打":36, "本塁打":0, "打点":8, "盗塁":2, "四球":5, "死球":0, "三振":13, "犠打":2, "併殺打":0, "長打率":0.253, "出塁率":0.252, "OPS":0.505, "RC27":1.96, "XR27":2.01 } +{ "id":"松中信彦", "team":"ソフトバンク", "打率":0.221, "試合数":65, "打席":164, "打数":136, "安打":30, "本塁打":4, "打点":13, "盗塁":1, "四球":26, "死球":2, "三振":26, "犠打":0, "併殺打":1, "長打率":0.324, "出塁率":0.354, "OPS":0.677, "RC27":4.6, "XR27":4.69 } +{ "id":"倉義和", "team":"広島", "打率":0.195, "試合数":70, "打席":160, "打数":133, "安打":26, "本塁打":1, "打点":15, "盗塁":1, "四球":14, "死球":5, "三振":20, "犠打":8, "併殺打":6, "長打率":0.263, "出塁率":0.296, "OPS":0.559, "RC27":2.23, "XR27":2.28 } +{ "id":"鈴木大地", "team":"ロッテ", "打率":0.274, "試合数":62, "打席":160, "打数":135, "安打":37, "本塁打":0, "打点":11, "盗塁":0, "四球":13, "死球":1, "三振":23, "犠打":10, "併殺打":1, "長打率":0.326, "出塁率":0.34, "OPS":0.666, "RC27":4.07, "XR27":4 } +{ "id":"大野奨太", "team":"日本ハム", "打率":0.171, "試合数":70, "打席":159, "打数":140, "安打":24, "本塁打":2, "打点":11, "盗塁":0, "四球":8, "死球":2, "三振":30, "犠打":9, "併殺打":2, "長打率":0.243, "出塁率":0.227, "OPS":0.47, "RC27":1.2, "XR27":1.35 } +{ "id":"北川博敏", "team":"オリックス", "打率":0.221, "試合数":59, "打席":159, "打数":140, "安打":31, "本塁打":1, "打点":11, "盗塁":0, "四球":17, "死球":0, "三振":23, "犠打":0, "併殺打":3, "長打率":0.286, "出塁率":0.302, "OPS":0.588, "RC27":2.95, "XR27":3.12 } +{ "id":"柴田講平", "team":"阪神", "打率":0.234, "試合数":73, "打席":157, "打数":128, "安打":30, "本塁打":0, "打点":2, "盗塁":4, "四球":16, "死球":1, "三振":23, "犠打":11, "併殺打":1, "長打率":0.273, "出塁率":0.322, "OPS":0.595, "RC27":3.03, "XR27":3.12 } +{ "id":"小池正晃", "team":"DeNA", "打率":0.192, "試合数":88, "打席":155, "打数":130, "安打":25, "本塁打":3, "打点":19, "盗塁":0, "四球":10, "死球":8, "三振":31, "犠打":6, "併殺打":5, "長打率":0.292, "出塁率":0.289, "OPS":0.581, "RC27":2.43, "XR27":2.63 } +{ "id":"西川遥輝", "team":"日本ハム", "打率":0.239, "試合数":71, "打席":155, "打数":134, "安打":32, "本塁打":2, "打点":13, "盗塁":7, "四球":14, "死球":0, "三振":34, "犠打":7, "併殺打":0, "長打率":0.343, "出塁率":0.311, "OPS":0.654, "RC27":4.18, "XR27":4.11 } +{ "id":"雄平", "team":"ヤクルト", "打率":0.28, "試合数":47, "打席":153, "打数":143, "安打":40, "本塁打":0, "打点":8, "盗塁":2, "四球":7, "死球":0, "三振":19, "犠打":3, "併殺打":0, "長打率":0.308, "出塁率":0.313, "OPS":0.621, "RC27":3.4, "XR27":3.31 } +{ "id":"スレッジ", "team":"日本ハム", "打率":0.232, "試合数":47, "打席":152, "打数":138, "安打":32, "本塁打":5, "打点":23, "盗塁":0, "四球":12, "死球":2, "三振":47, "犠打":0, 
"併殺打":2, "長打率":0.406, "出塁率":0.303, "OPS":0.708, "RC27":4.22, "XR27":4.22 } +{ "id":"下園辰哉", "team":"DeNA", "打率":0.252, "試合数":90, "打席":150, "打数":139, "安打":35, "本塁打":0, "打点":14, "盗塁":0, "四球":10, "死球":0, "三振":21, "犠打":1, "併殺打":6, "長打率":0.324, "出塁率":0.302, "OPS":0.626, "RC27":2.93, "XR27":2.77 } +{ "id":"松井淳", "team":"ヤクルト", "打率":0.287, "試合数":46, "打席":150, "打数":143, "安打":41, "本塁打":5, "打点":15, "盗塁":1, "四球":6, "死球":1, "三振":26, "犠打":0, "併殺打":5, "長打率":0.462, "出塁率":0.32, "OPS":0.782, "RC27":4.79, "XR27":4.6 } +{ "id":"野中信吾", "team":"オリックス", "打率":0.217, "試合数":74, "打席":149, "打数":120, "安打":26, "本塁打":0, "打点":7, "盗塁":12, "四球":11, "死球":2, "三振":20, "犠打":16, "併殺打":2, "長打率":0.275, "出塁率":0.293, "OPS":0.568, "RC27":2.4, "XR27":2.44 } +{ "id":"古城茂幸", "team":"巨人", "打率":0.209, "試合数":65, "打席":149, "打数":129, "安打":27, "本塁打":0, "打点":8, "盗塁":2, "四球":12, "死球":0, "三振":28, "犠打":6, "併殺打":5, "長打率":0.256, "出塁率":0.273, "OPS":0.529, "RC27":1.96, "XR27":2.07 } \ No newline at end of file diff --git a/processor/src/test/resources/shogun.json b/processor/src/test/resources/shogun.json new file mode 100644 index 0000000..469b1bc --- /dev/null +++ b/processor/src/test/resources/shogun.json @@ -0,0 +1,20 @@ +{ + "method": "AROW", + "converter": { + "num_filter_types": {}, + "num_filter_rules": [], + "string_filter_types": {}, + "string_filter_rules": [], + "num_types": {}, + "num_rules": [], + "string_types": { + "unigram": { "method": "ngram", "char_num": "1" } + }, + "string_rules": [ + { "key": "*", "type": "unigram", "sample_weight": "bin", "global_weight": "bin" } + ] + }, + "parameter": { + "regularization_weight" : 1.0 + } +} diff --git a/processor/src/test/resources/shogun_data.json b/processor/src/test/resources/shogun_data.json new file mode 100644 index 0000000..8096d19 --- /dev/null +++ b/processor/src/test/resources/shogun_data.json @@ -0,0 +1,44 @@ +{"label":"徳川","name":"家康"} +{"label":"徳川","name":"秀忠"} +{"label":"徳川","name":"家光"} +{"label":"徳川","name":"家綱"} +{"label":"徳川","name":"綱吉"} +{"label":"徳川","name":"家宣"} +{"label":"徳川","name":"家継"} +{"label":"徳川","name":"吉宗"} +{"label":"徳川","name":"家重"} +{"label":"徳川","name":"家治"} +{"label":"徳川","name":"家斉"} +{"label":"徳川","name":"家慶"} +{"label":"徳川","name":"家定"} +{"label":"徳川","name":"家茂"} +{"label":"足利","name":"尊氏"} +{"label":"足利","name":"義詮"} +{"label":"足利","name":"義満"} +{"label":"足利","name":"義持"} +{"label":"足利","name":"義量"} +{"label":"足利","name":"義教"} +{"label":"足利","name":"義勝"} +{"label":"足利","name":"義政"} +{"label":"足利","name":"義尚"} +{"label":"足利","name":"義稙"} +{"label":"足利","name":"義澄"} +{"label":"足利","name":"義稙"} +{"label":"足利","name":"義晴"} +{"label":"足利","name":"義輝"} +{"label":"足利","name":"義栄"} +{"label":"北条","name":"時政"} +{"label":"北条","name":"義時"} +{"label":"北条","name":"泰時"} +{"label":"北条","name":"経時"} +{"label":"北条","name":"時頼"} +{"label":"北条","name":"長時"} +{"label":"北条","name":"政村"} +{"label":"北条","name":"時宗"} +{"label":"北条","name":"貞時"} +{"label":"北条","name":"師時"} +{"label":"北条","name":"宗宣"} +{"label":"北条","name":"煕時"} +{"label":"北条","name":"基時"} +{"label":"北条","name":"高時"} +{"label":"北条","name":"貞顕"} diff --git a/processor/src/test/resources/yarn-site.xml.dist b/processor/src/test/resources/yarn-site.xml.dist new file mode 100644 index 0000000..ad2b61a --- /dev/null +++ b/processor/src/test/resources/yarn-site.xml.dist @@ -0,0 +1,136 @@ + + + + + + yarn.acl.enable + true + + + yarn.admin.acl + * + + + yarn.resourcemanager.address + [host]:[port] + + + yarn.resourcemanager.admin.address + [host]:[port] + + + yarn.resourcemanager.scheduler.address + [host]:[port] 
diff --git a/processor/src/test/resources/yarn-site.xml.dist b/processor/src/test/resources/yarn-site.xml.dist new file mode 100644 index 0000000..ad2b61a --- /dev/null +++ b/processor/src/test/resources/yarn-site.xml.dist @@ -0,0 +1,136 @@ +<?xml version="1.0" encoding="UTF-8"?> +<configuration> + <property> + <name>yarn.acl.enable</name> + <value>true</value> + </property> + <property> + <name>yarn.admin.acl</name> + <value>*</value> + </property> + <property> + <name>yarn.resourcemanager.address</name> + <value>[host]:[port]</value> + </property> + <property> + <name>yarn.resourcemanager.admin.address</name> + <value>[host]:[port]</value> + </property> + <property> + <name>yarn.resourcemanager.scheduler.address</name> + <value>[host]:[port]</value> + </property> + <property> + <name>yarn.resourcemanager.resource-tracker.address</name> + <value>[host]:[port]</value> + </property> + <property> + <name>yarn.resourcemanager.webapp.address</name> + <value>[host]:[port]</value> + </property> + <property> + <name>yarn.resourcemanager.webapp.https.address</name> + <value>[host]:[port]</value> + </property> + <property> + <name>yarn.resourcemanager.client.thread-count</name> + <value>50</value> + </property> + <property> + <name>yarn.resourcemanager.scheduler.client.thread-count</name> + <value>50</value> + </property> + <property> + <name>yarn.resourcemanager.admin.client.thread-count</name> + <value>1</value> + </property> + <property> + <name>yarn.scheduler.minimum-allocation-mb</name> + <value>1024</value> + </property> + <property> + <name>yarn.scheduler.increment-allocation-mb</name> + <value>512</value> + </property> + <property> + <name>yarn.scheduler.maximum-allocation-mb</name> + <value>6538</value> + </property> + <property> + <name>yarn.scheduler.minimum-allocation-vcores</name> + <value>1</value> + </property> + <property> + <name>yarn.scheduler.increment-allocation-vcores</name> + <value>1</value> + </property> + <property> + <name>yarn.scheduler.maximum-allocation-vcores</name> + <value>4</value> + </property> + <property> + <name>yarn.resourcemanager.amliveliness-monitor.interval-ms</name> + <value>1000</value> + </property> + <property> + <name>yarn.am.liveness-monitor.expiry-interval-ms</name> + <value>600000</value> + </property> + <property> + <name>yarn.resourcemanager.am.max-attempts</name> + <value>2</value> + </property> + <property> + <name>yarn.resourcemanager.container.liveness-monitor.interval-ms</name> + <value>600000</value> + </property> + <property> + <name>yarn.resourcemanager.nm.liveness-monitor.interval-ms</name> + <value>1000</value> + </property> + <property> + <name>yarn.nm.liveness-monitor.expiry-interval-ms</name> + <value>600000</value> + </property> + <property> + <name>yarn.resourcemanager.resource-tracker.client.thread-count</name> + <value>50</value> + </property> + <property> + <name>yarn.application.classpath</name> + <value>$HADOOP_CLIENT_CONF_DIR,$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*</value> + </property> + <property> + <name>yarn.resourcemanager.scheduler.class</name> + <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value> + </property> + <property> + <name>yarn.scheduler.fair.user-as-default-queue</name> + <value>true</value> + </property> + <property> + <name>yarn.scheduler.fair.preemption</name> + <value>false</value> + </property> + <property> + <name>yarn.scheduler.fair.sizebasedweight</name> + <value>false</value> + </property> + <property> + <name>yarn.scheduler.fair.assignmultiple</name> + <value>false</value> + </property> + <property> + <name>yarn.resourcemanager.max-completed-applications</name> + <value>10000</value> + </property> + <property> + <name>yarn.nodemanager.aux-services</name> + <value></value> + </property> +</configuration> diff --git a/processor/src/test/scala/us/jubat/jubaql_server/processor/HasKafkaPath.scala b/processor/src/test/scala/us/jubat/jubaql_server/processor/HasKafkaPath.scala new file mode 100644 index 0000000..3452eb6 --- /dev/null +++ b/processor/src/test/scala/us/jubat/jubaql_server/processor/HasKafkaPath.scala @@ -0,0 +1,40 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details.
+// + // You should have received a copy of the GNU Lesser General Public + // License along with this library; if not, write to the Free Software + // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import java.io.{FileNotFoundException, FileInputStream} +import java.util.Properties + +import org.scalatest._ + +trait HasKafkaPath extends ShouldMatchers { + lazy val kafkaPath: String = { + // kafka.xml is a java.util.Properties XML file (cf. kafka.xml.dist) + // whose "path" entry gives the Kafka location that the tests use to + // build their kafka:// stream URIs + val kafkaXmlPath = "src/test/resources/kafka.xml" + + val is = try { + Some(new FileInputStream(kafkaXmlPath)) + } catch { + case _: FileNotFoundException => + None + } + is shouldBe a[Some[_]] + + val properties = new Properties() + properties.loadFromXML(is.get) + + properties.getProperty("path") + } +} diff --git a/processor/src/test/scala/us/jubat/jubaql_server/processor/HybridProcessorSpec.scala b/processor/src/test/scala/us/jubat/jubaql_server/processor/HybridProcessorSpec.scala new file mode 100644 index 0000000..3f42a8f --- /dev/null +++ b/processor/src/test/scala/us/jubat/jubaql_server/processor/HybridProcessorSpec.scala @@ -0,0 +1,288 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import java.io.{FileNotFoundException, FileInputStream} +import java.util.Properties + +import org.scalatest._ +import org.apache.spark.SparkContext +import org.apache.spark.sql.SQLContext +import org.apache.spark.sql.catalyst.types.{StringType, LongType, StructField, StructType} + +class HybridProcessorSpec + extends FlatSpec + with ShouldMatchers + with BeforeAndAfterAll { + val sc = new SparkContext("local[3]", "HybridProcessorSpec") + val sqlc = new SQLContext(sc) + + "HybridProcessor" should "throw an exception for an invalid storage location" taggedAs (LocalTest) in { + the[IllegalArgumentException] thrownBy { + new HybridProcessor(sc, sqlc, "myLoc", Nil) + } should have message "'myLoc' is not a valid storage specification" + } + + it should "throw an exception for an invalid stream location" taggedAs (LocalTest) in { + the[IllegalArgumentException] thrownBy { + new HybridProcessor(sc, sqlc, "file:///tmp", "myLoc" :: Nil) + } should have message "'myLoc' is not a valid stream specification" + } + + it should "throw an exception for more than one stream location" taggedAs (LocalTest) in { + the[IllegalArgumentException] thrownBy { + new HybridProcessor(sc, sqlc, "file:///tmp", "myLoc" :: "yourLoc" :: Nil) + } should have message ("requirement failed: " + + "More than one stream location is not supported at the moment.") + } + + "Static-only processing (empty source)" should "end when the processing is done" taggedAs (LocalTest) in { + val startTime = System.nanoTime() + val processor = new
HybridProcessor(sc, sqlc, "empty", Nil) + processor.start(rdd => rdd) + processor.awaitTermination() + val executionTime = (System.nanoTime() - startTime) + executionTime.toDouble should be < 5e9 // less than 5 seconds + } + + "Static-only processing (local files)" should "end when the processing is done" taggedAs (LocalTest) in { + val startTime = System.nanoTime() + val processor = new HybridProcessor(sc, sqlc, "file://src/test/resources/dummydata", Nil) + processor.start(rdd => rdd) + processor.awaitTermination() + val executionTime = (System.nanoTime() - startTime) + executionTime.toDouble should be < 10e9 // less than 10 seconds + } + + override def afterAll = { + sc.stop() + } +} + +class HDFSStreamSpec + extends FlatSpec + with ShouldMatchers + with BeforeAndAfterAll { + val sc = new SparkContext("local[3]", "HDFSStreamSpec") + val sqlc = new SQLContext(sc) + + "HybridProcessor" should "accept valid HDFS paths" taggedAs (LocalTest) in { + val hdfsPath = "hdfs:///tmp" + noException should be thrownBy { + new HybridProcessor(sc, sqlc, hdfsPath, Nil) + } + } + + it should "not accept invalid HDFS paths" taggedAs (LocalTest) in { + val hdfsPath = "hdfs:/abcd//tmp" + the[IllegalArgumentException] thrownBy { + new HybridProcessor(sc, sqlc, hdfsPath, Nil) + } should have message s"'$hdfsPath' is not a valid storage specification" + } + + "HDFS-only processing on a populated directory" should "end when the processing is done" taggedAs (HDFSTest) in { + val path = "hdfs:///user/fluentd/dummy" + val processor = new HybridProcessor(sc, sqlc, path, Nil) + val startTime = System.nanoTime() + val stopFun = processor.start(rdd => rdd)._1 + + processor.awaitTermination() + val executionTime = (System.nanoTime() - startTime) + executionTime.toDouble should be < 50e9 // less than 50 seconds + val (staticInfo, streamInfo) = stopFun() + // check number of items and received id + staticInfo.itemCount should be > 10000L + staticInfo.runtime should be > 0L + staticInfo.maxId should not be empty + streamInfo.itemCount shouldBe 0L + streamInfo.runtime shouldBe 0L + streamInfo.maxId shouldBe empty + } + + it should "be manually stoppable" taggedAs (HDFSTest) in { + val path = "hdfs:///user/fluentd/dummy" + val processor = new HybridProcessor(sc, sqlc, path, Nil) + val startTime = System.nanoTime() + val stopFun = processor.start(rdd => rdd)._1 + Thread.sleep(5000) // if we stop during the first batch, something goes wrong + val (staticInfo, streamInfo) = stopFun() + val executionTime = (System.nanoTime() - startTime) + executionTime.toDouble should be < 50e9 // less than 50 seconds + // check number of items and received id + staticInfo.itemCount should be > 10000L + staticInfo.runtime should be > 0L + staticInfo.maxId should not be empty + streamInfo.itemCount shouldBe 0L + streamInfo.runtime shouldBe 0L + streamInfo.maxId shouldBe empty + } + + "HDFS-only processing on an empty directory" should "not process anything" taggedAs (HDFSTest) in { + val path = "hdfs:///user/empty" + val processor = new HybridProcessor(sc, sqlc, path, Nil) + val stopFun = processor.start(rdd => rdd)._1 + processor.awaitTermination() + val (staticInfo, streamInfo) = stopFun() + // check number of items and received id + staticInfo.itemCount shouldBe 0L + staticInfo.runtime should be > 0L + staticInfo.maxId shouldBe empty + streamInfo.itemCount shouldBe 0L + streamInfo.runtime shouldBe 0L + streamInfo.maxId shouldBe empty + } + + override def afterAll = { + sc.stop() + } +} + +class KafkaStreamSpec + extends FlatSpec + with 
ShouldMatchers + with HasKafkaPath + with BeforeAndAfterAll { + val sc = new SparkContext("local[3]", "KafkaStreamSpec") + val sqlc = new SQLContext(sc) + + "HybridProcessor" should "accept valid Kafka paths" taggedAs (LocalTest) in { + val kafkaURI = s"kafka://$kafkaPath/dummy/1" + noException should be thrownBy { + new HybridProcessor(sc, sqlc, "empty", kafkaURI :: Nil) + } + } + + it should "not accept invalid Kafka paths" taggedAs (LocalTest) in { + val kafkaURI = s"kafka://$kafkaPath/dummy/1/300" + the[IllegalArgumentException] thrownBy { + new HybridProcessor(sc, sqlc, "file:///tmp", kafkaURI :: Nil) + } should have message s"'$kafkaURI' is not a valid stream specification" + } + + "Kafka-only processing on a populated topic" should "be manually stoppable before starting" taggedAs (KafkaTest) in { + val path = s"kafka://$kafkaPath/dummy/1" + val processor = new HybridProcessor(sc, sqlc, "empty", path :: Nil) + val startTime = System.nanoTime() + val stopFun = processor.start(rdd => rdd)._1 + Thread.sleep(1700) // if we stop during the first batch, something goes wrong + val (staticInfo, streamInfo) = stopFun() + processor.awaitTermination() + val executionTime = (System.nanoTime() - startTime) + executionTime.toDouble should be < 10e9 // less than 10 seconds + // check number of items and received id + staticInfo.itemCount shouldBe 0L + staticInfo.runtime should be > 0L + staticInfo.maxId shouldBe empty + streamInfo.itemCount shouldBe 0L + streamInfo.runtime shouldBe 0L + streamInfo.maxId shouldBe empty + } + + it should "be manually stoppable while running" taggedAs (KafkaTest) in { + val path = s"kafka://$kafkaPath/dummy/1" + val processor = new HybridProcessor(sc, sqlc, "empty", path :: Nil) + val startTime = System.nanoTime() + val stopFun = processor.start(rdd => rdd)._1 + Thread.sleep(10000) // if we stop during the first batch, something goes wrong + val (staticInfo, streamInfo) = stopFun() + val executionTime = (System.nanoTime() - startTime) + executionTime.toDouble should be < 10e10 // less than 100 seconds + // check number of items and received id + staticInfo.itemCount shouldBe 0L + staticInfo.runtime should be > 0L + staticInfo.maxId shouldBe empty + streamInfo.itemCount should be > 10000L + streamInfo.runtime should be > 0L + streamInfo.maxId should not be empty + } + + it should "be processable using SQL" taggedAs (KafkaTest) in { + val path = s"kafka://$kafkaPath/dummy/1" + val processor = new HybridProcessor(sc, sqlc, "empty", path :: Nil) + val startTime = System.nanoTime() + val schema = StructType(List(StructField("video_id", LongType, false), + StructField("title", StringType, false))) + import sqlc._ + val stopFun = processor.start(rdd => { + rdd.registerTempTable("test") + sql("SELECT video_id FROM test LIMIT 10") + }, Some(schema))._1 + Thread.sleep(15000) // if we stop during the first batch, something goes wrong + val (staticInfo, streamInfo) = stopFun() + val executionTime = (System.nanoTime() - startTime) + executionTime.toDouble should be < 40e9 // less than 40 seconds + // check number of items and received id + staticInfo.itemCount shouldBe 0L + staticInfo.runtime should be > 0L + staticInfo.maxId shouldBe empty + streamInfo.itemCount should be > 0L + streamInfo.itemCount should be < 100L + streamInfo.runtime should be > 0L + streamInfo.maxId should not be empty + } + + "Kafka-only processing on an empty topic" should "not process anything" taggedAs (KafkaTest) in { + Thread.sleep(2000) + val path = s"kafka://$kafkaPath/notopic/1" + val processor = new 
HybridProcessor(sc, sqlc, "empty", path :: Nil) + val stopFun = processor.start(rdd => rdd)._1 + Thread.sleep(10000) // if we stop during the first batch, something goes wrong + val (staticInfo, streamInfo) = stopFun() + // check number of items and received id + staticInfo.itemCount shouldBe 0L + staticInfo.runtime should be > 0L + staticInfo.maxId shouldBe empty + streamInfo.itemCount shouldBe 0L + streamInfo.runtime should be > 0L + streamInfo.maxId shouldBe empty + } + + override def afterAll = { + sc.stop() + } +} + +class HDFSKafkaStreamSpec + extends FlatSpec + with ShouldMatchers + with HasKafkaPath + with BeforeAndAfterAll { + val sc = new SparkContext("local[3]", "KafkaStreamSpec") + val sqlc = new SQLContext(sc) + + "HDFS+Kafka processing" should "change processing smoothly" taggedAs (HDFSTest, KafkaTest) in { + val hdfsPath = "hdfs:///user/fluentd/dummy" + val kafkaURI = s"kafka://$kafkaPath/dummy/1" + val processor = new HybridProcessor(sc, sqlc, hdfsPath, kafkaURI :: Nil) + val stopFun = processor.start(rdd => rdd)._1 + Thread.sleep(60000) + val (staticInfo, streamInfo) = stopFun() + // check number of items and received id + staticInfo.itemCount should be > 10000L + staticInfo.runtime should be > 0L + staticInfo.maxId should not be empty + streamInfo.itemCount should be > 1000L + streamInfo.runtime should be > 0L + streamInfo.maxId should not be empty + // we can't make a comparison such as "id x should be N larger than id y" + // with string ids, but we can check one is larger than the other + streamInfo.maxId.get should be > staticInfo.maxId.get + } + + override def afterAll = { + sc.stop() + } +} diff --git a/processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLParserSpec.scala b/processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLParserSpec.scala new file mode 100644 index 0000000..d476e5a --- /dev/null +++ b/processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLParserSpec.scala @@ -0,0 +1,128 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import org.scalatest.FlatSpec +import org.scalatest.Matchers._ + +/* This test case tests only that the parser recognizes the syntax + * that was defined for JubaQL correctly. It does not test the + * functionality that was defined for each statement. 
+ */ +class JubaQLParserSpec extends FlatSpec { + // TODO write more CREATE DATASOURCE tests + + "A JubaQLParser" should "recognize CREATE DATASOURCE without schema" taggedAs (LocalTest) in { + val parser = new JubaQLParser + val result: Option[JubaQLAST] = parser.parse( + """ + CREATE DATASOURCE test1 + FROM (STORAGE: "hdfs://hello", STREAM: "fluentd://f1", STREAM: "fluentd://f2") + """.stripMargin + ) + + result shouldNot be(None) + val ds = result.get.asInstanceOf[CreateDatasource] + ds.sourceName shouldBe "test1" + ds.columns shouldBe List() + ds.sinkStorage shouldBe "hdfs://hello" + ds.sinkStreams shouldBe List("fluentd://f1", "fluentd://f2") + } + + "A JubaQLParser" should "recognize CREATE DATASOURCE" taggedAs (LocalTest) in { + val parser = new JubaQLParser + val result: Option[JubaQLAST] = parser.parse( + """ + CREATE DATASOURCE test1 (column_type1 string, column_type2 numeric, column_type3 boolean) + FROM (STORAGE: "hdfs://hello", STREAM: "fluentd://f1", STREAM: "fluentd://f2") + """.stripMargin + ) + + result shouldNot be(None) + val ds = result.get.asInstanceOf[CreateDatasource] + ds.sourceName shouldBe "test1" + ds.columns shouldBe List(("column_type1", "string"), ("column_type2", "numeric"), ("column_type3", "boolean")) + ds.sinkStorage shouldBe "hdfs://hello" + ds.sinkStreams shouldBe List("fluentd://f1", "fluentd://f2") + } + + // TODO write more CREATE MODEL tests + + it should "recognize CREATE MODEL" taggedAs (LocalTest) in { + val parser = new JubaQLParser + // use single quotation + val result: Option[JubaQLAST] = parser.parse( + """ + CREATE ANOMALY MODEL test1 WITH(id: "id", datum: ["a", "b"]) config = '{"test": 123}' + """.stripMargin + ) + + result shouldNot be(None) + val create = result.get.asInstanceOf[CreateModel] + create.algorithm shouldBe "ANOMALY" + create.modelName shouldBe "test1" + create.configJson shouldBe "{\"test\": 123}" + create.specifier shouldBe List(("id", List("id")), ("datum", List("a", "b"))) + } + + // TODO write more UPDATE tests + + it should "recognize UPDATE" taggedAs (LocalTest) in { + val parser = new JubaQLParser + val result: Option[JubaQLAST] = parser.parse( + """ + UPDATE MODEL juba_model USING train FROM source + """.stripMargin + ) + + result shouldNot be(None) + val update = result.get.asInstanceOf[Update] + update.modelName shouldBe "juba_model" + update.rpcName shouldBe "train" + update.source shouldBe "source" + } + + // TODO write more ANALYZE tests + + it should "recognize ANALYZE" taggedAs (LocalTest) in { + val parser = new JubaQLParser + val result: Option[JubaQLAST] = parser.parse( + """ + ANALYZE '{"test": 123}' BY MODEL juba_model USING calc_score + """.stripMargin + ) + + result shouldNot be(None) + val analyze = result.get.asInstanceOf[Analyze] + analyze.modelName shouldBe "juba_model" + analyze.rpcName shouldBe "calc_score" + analyze.data shouldBe "{\"test\": 123}" + } + + it should "recognize SHUTDOWN" taggedAs (LocalTest) in { + val parser = new JubaQLParser + val result: Option[JubaQLAST] = parser.parse("SHUTDOWN") + result should not be empty + result.get shouldBe a[Shutdown] + } + + it should "recognize STOP PROCESSING" taggedAs (LocalTest) in { + val parser = new JubaQLParser + val result: Option[JubaQLAST] = parser.parse("STOP PROCESSING") + result should not be empty + result.get shouldBe a[StopProcessing] + } +} diff --git a/processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLProcessorSpec.scala b/processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLProcessorSpec.scala new file mode 
100644 index 0000000..5b6fa32 --- /dev/null +++ b/processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLProcessorSpec.scala @@ -0,0 +1,564 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import scala.sys.process._ +import scala.io.Source +import org.scalatest._ +import java.nio.file.{Paths, Files} +import dispatch._ +import dispatch.Defaults._ +import org.json4s._ +import org.json4s.JsonDSL._ +import org.json4s.native.JsonMethods._ +import scala.util.{Success, Failure, Try} +import us.jubat.jubaql_server.processor.json.ClassifierResult + +/** Tests the processor's behavior as viewed from the outside, + * i.e. through its HTTP interface. + */ +class JubaQLProcessorSpec + extends FlatSpec + with Matchers + with HasKafkaPath + with BeforeAndAfter + with BeforeAndAfterAll { + + implicit val formats = DefaultFormats + + var process: Process = null + var stdout: StringBuffer = null + var sendJubaQL: String => Try[(Int, JValue)] = null + + before { + val startResult = startProcessor() + process = startResult._1 + stdout = startResult._2 + sendJubaQL = startResult._3 + } + + after { + // all tests should send SHUTDOWN for cleanup; this is just a + // fallback to avoid leaving zombie processes behind + process.destroy() + } + + val goodCdStmt = """CREATE DATASOURCE ds1 (label string, name string) FROM (STORAGE: "file://src/test/resources/shogun_data.json")""" + val goodCmStmt = """CREATE CLASSIFIER MODEL test1 WITH (label: "label", datum: "name") config = '{"method": "AROW","converter": { "num_filter_types": {}, "num_filter_rules": [], "string_filter_types": {}, "string_filter_rules": [], "num_types": {}, "num_rules": [],"string_types": {"unigram": { "method": "ngram", "char_num": "1" }},"string_rules": [{ "key": "*", "type": "unigram", "sample_weight": "bin", "global_weight": "bin" } ]},"parameter": {"regularization_weight" : 1.0}}'""" + val goodUmStmt = """UPDATE MODEL test1 USING train FROM ds1""" + val goodAStmt = """ANALYZE '{"name": "慶喜"}' BY MODEL test1 USING classify""" + + + "CREATE DATASOURCE" should "return HTTP 200 on correct syntax" taggedAs (LocalTest) in { + val cdResult = sendJubaQL(goodCdStmt) + cdResult shouldBe a[Success[_]] + cdResult.get._1 shouldBe 200 + cdResult.get._2 \ "result" shouldBe JString("CREATE DATASOURCE") + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + it should "return HTTP 500 on bad syntax" taggedAs (LocalTest) in { + // TODO no it shouldn't (400 is better) + // the statement below is bad because we don't know the protocol hdddddfs + val badCdStmt = """CREATE DATASOURCE ds1 (label string, name string) FROM (STORAGE:
"hdddddfs:///jubatus-on-yarn/sample/shogun_data.json")""" + val cdResult = sendJubaQL(badCdStmt) + cdResult shouldBe a[Success[_]] + cdResult.get._1 shouldBe 500 + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + + "CREATE MODEL" should "return HTTP 200 on correct syntax" taggedAs (LocalTest, JubatusTest) in { + val cmResult = sendJubaQL(goodCmStmt) + cmResult shouldBe a[Success[_]] + cmResult.get._1 shouldBe 200 + cmResult.get._2 \ "result" shouldBe JString("CREATE MODEL (started)") + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + // TODO ignored because server currently ignores the bad syntax + it should "return HTTP 500 on bad syntax" taggedAs (LocalTest, JubatusTest) ignore { + // TODO no it shouldn't (400 is better) + // the statement below is bad because "hello" is not a valid keyword + val badCmStmt = """CREATE CLASSIFIER MODEL test1 WITH(hello: "label", datum: "name") config = '{"method": "AROW","converter": { "num_filter_types": {}, "num_filter_rules": [], "string_filter_types": {}, "string_filter_rules": [], "num_types": {}, "num_rules": [],"string_types": {"unigram": { "method": "ngram", "char_num": "1" }},"string_rules": [{ "key": "*", "type": "unigram", "sample_weight": "bin", "global_weight": "bin" } ]},"parameter": {"regularization_weight" : 1.0}}'""" + val cmResult = sendJubaQL(badCmStmt) + cmResult shouldBe a[Success[_]] + cmResult.get._1 shouldBe 500 + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + + "UPDATE MODEL" should "return HTTP 200 when model and datasource are present" taggedAs (LocalTest, JubatusTest) in { + val cmResult = sendJubaQL(goodCmStmt) + cmResult shouldBe a[Success[_]] + val cdResult = sendJubaQL(goodCdStmt) + cdResult shouldBe a[Success[_]] + // start updating + val umResult = sendJubaQL(goodUmStmt) + umResult shouldBe a[Success[_]] + umResult.get._1 shouldBe 200 + umResult.get._2 \ "result" shouldBe JString("UPDATE MODEL") + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + it should "return HTTP 500 when model is missing" taggedAs (LocalTest) in { + // TODO no it shouldn't (400 is better) + val cdResult = sendJubaQL(goodCdStmt) + cdResult shouldBe a[Success[_]] + // start updating + val umResult = sendJubaQL(goodUmStmt) + umResult shouldBe a[Success[_]] + umResult.get._1 shouldBe 500 + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + it should "return HTTP 500 when data source is missing" taggedAs (LocalTest, JubatusTest) in { + // TODO no it shouldn't (400 is better) + val cmResult = sendJubaQL(goodCmStmt) + cmResult shouldBe a[Success[_]] + // start updating + val umResult = sendJubaQL(goodUmStmt) + umResult shouldBe a[Success[_]] + umResult.get._1 shouldBe 500 + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + + "ANALYZE" should "return with a meaningful result when 
UPDATE was run" taggedAs (LocalTest, JubatusTest) in { + val cmResult = sendJubaQL(goodCmStmt) + cmResult shouldBe a[Success[_]] + val cdResult = sendJubaQL(goodCdStmt) + cdResult shouldBe a[Success[_]] + val umResult = sendJubaQL(goodUmStmt) + umResult shouldBe a[Success[_]] + // query + Thread.sleep(1000) + val aResult = sendJubaQL(goodAStmt) + aResult shouldBe a[Success[_]] + aResult.get._1 shouldBe 200 + aResult.get._2 \ "result" \ "predictions" shouldBe a[JArray] + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + it should "return with a meaningful result after STOP PROCESSING" taggedAs (LocalTest, JubatusTest) in { + val cmResult = sendJubaQL(goodCmStmt) + cmResult shouldBe a[Success[_]] + val cdResult = sendJubaQL(goodCdStmt) + cdResult shouldBe a[Success[_]] + val umResult = sendJubaQL(goodUmStmt) + umResult shouldBe a[Success[_]] + Thread.sleep(1000) + val spResult = sendJubaQL("STOP PROCESSING") + spResult shouldBe a[Success[_]] + // query + val aResult = sendJubaQL(goodAStmt) + aResult shouldBe a[Success[_]] + aResult.get._1 shouldBe 200 + aResult.get._2 \ "result" \ "predictions" shouldBe a[JArray] + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + it should "work correctly with CLASSIFIER" taggedAs (LocalTest, JubatusTest) in { + val cmStmt = """CREATE DATASOURCE ds (label string, name string) FROM (STORAGE: "file://src/test/resources/shogun_data.json")""" + val cmResult = sendJubaQL(cmStmt) + cmResult shouldBe a[Success[_]] + + val config = Source.fromFile("src/test/resources/shogun.json").getLines().mkString("") + val cdStmt = s"""CREATE CLASSIFIER MODEL test WITH (label: "label", datum: "name") config = '$config'""" + val cdResult = sendJubaQL(cdStmt) + cdResult shouldBe a[Success[_]] + + val umStmt = """UPDATE MODEL test USING train FROM ds""" + val umResult = sendJubaQL(umStmt) + umResult shouldBe a[Success[_]] + Thread.sleep(2500) + + // analyze + val aStmt = """ANALYZE '{"name": "慶喜"}' BY MODEL test USING classify""" + val aResult = sendJubaQL(aStmt) + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // now check the result + aResult shouldBe a[Success[_]] + if (aResult.get._1 != 200) + println(stdout.toString) + aResult.get._1 shouldBe 200 + (aResult.get._2 \ "result").extractOpt[ClassifierResult] match { + case Some(pred) => + val scores = pred.predictions.map(res => (res.label, res.score)).toMap + // the order of entries differs per machine/OS, so we use this + // slightly complicated way of checking equality + scores.keys.toList should contain only("徳川", "足利", "北条") + Math.abs(scores("徳川") - 0.07692306488752365) should be < 0.00001 + scores("足利") shouldBe 0.0 + scores("北条") shouldBe 0.0 + case None => + fail("Failed to parse returned content as a classifier result") + } + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + it should "work correctly with ANOMALY" taggedAs (LocalTest, JubatusTest) in { + val cmStmt = """CREATE DATASOURCE ds (label string, name string) FROM (STORAGE: "file://src/test/resources/shogun_data.json")""" + val cmResult = sendJubaQL(cmStmt) + cmResult shouldBe a[Success[_]] + + val config = Source.fromFile("src/test/resources/lof.json").getLines().mkString("") + val cdStmt = s"""CREATE ANOMALY MODEL test 
WITH (label: "label", datum: "name") config = '$config'""" + val cdResult = sendJubaQL(cdStmt) + cdResult shouldBe a[Success[_]] + + val umStmt = """UPDATE MODEL test USING add FROM ds""" + val umResult = sendJubaQL(umStmt) + umResult shouldBe a[Success[_]] + Thread.sleep(2500) + + // analyze + val aStmt = """ANALYZE '{"name": "慶喜"}' BY MODEL test USING calc_score""" + val aResult = sendJubaQL(aStmt) + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // now check the result + aResult shouldBe a[Success[_]] + if (aResult.get._1 != 200) + println(stdout.toString) + aResult.get._1 shouldBe 200 + aResult.get._2 \ "result" \ "score" match { + case JDouble(score) => { + // the result of calc_score seems to differ slightly between + // machines/OSes, therefore we do a proximity check instead + // of equality comparison + Math.abs(score - 1.006646) should be < 0.0005 + } + case _ => + fail("Failed to parse returned content as an anomaly result") + } + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + it should "work correctly with RECOMMENDER/from_id" taggedAs (LocalTest, JubatusTest) in { + val cmStmt = """CREATE DATASOURCE ds FROM (STORAGE: "file://src/test/resources/npb_similar_player_data.json")""" + val cmResult = sendJubaQL(cmStmt) + cmResult shouldBe a[Success[_]] + + val config = Source.fromFile("src/test/resources/npb_similar_player.json").getLines().mkString("") + val cdStmt = s"""CREATE RECOMMENDER MODEL test WITH (id: "id", datum: ["team", "打率", "試合数", "打席", "打数", "安打", "本塁打", "打点", "盗塁", "四球", "死球", "三振", "犠打", "併殺打", "長打率", "出塁率", "OPS", "RC27", "XR27"]) config = '$config'""" + val cdResult = sendJubaQL(cdStmt) + cdResult shouldBe a[Success[_]] + + val umStmt = """UPDATE MODEL test USING update_row FROM ds""" + val umResult = sendJubaQL(umStmt) + umResult shouldBe a[Success[_]] + Thread.sleep(2500) + + // analyze + val aStmt = """ANALYZE 'スレッジ' BY MODEL test USING complete_row_from_id""" + val aResult = sendJubaQL(aStmt) + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // now check the result + aResult shouldBe a[Success[_]] + if (aResult.get._1 != 200) + println(stdout.toString) + aResult.get._1 shouldBe 200 + aResult.get._2 \ "result" \ "num_values" match { + case JObject(list) => + val vals = list.collect({ + case (s, JDouble(j)) => (s, j) + }).toMap + Math.abs(vals("長打率") - 0.3539453148841858) should be < 0.00001 + Math.abs(vals("試合数") - 104.234375) should be < 0.00001 + Math.abs(vals("打数") - 331.5546875) should be < 0.00001 + case _ => + fail("there was no 'num_values' key") + } + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + it should "work correctly with RECOMMENDER/from_data" taggedAs (LocalTest, JubatusTest) in { + val cmStmt = """CREATE DATASOURCE ds FROM (STORAGE: "file://src/test/resources/npb_similar_player_data.json")""" + val cmResult = sendJubaQL(cmStmt) + cmResult shouldBe a[Success[_]] + + val config = Source.fromFile("src/test/resources/npb_similar_player.json").getLines().mkString("") + val cdStmt = s"""CREATE RECOMMENDER MODEL test WITH (id: "id", datum: ["team", "打率", "試合数", "打席", "打数", "安打", "本塁打", "打点", "盗塁", "四球", "死球", "三振", "犠打", "併殺打", "長打率", "出塁率", "OPS", "RC27", "XR27"]) config = '$config'""" + val cdResult = sendJubaQL(cdStmt) + cdResult shouldBe a[Success[_]] + + val umStmt = """UPDATE MODEL test USING update_row FROM ds""" + val umResult = sendJubaQL(umStmt) + umResult shouldBe 
a[Success[_]] + Thread.sleep(2500) + + // analyze + val aStmt = """ANALYZE '{"team":"巨人","打率":0.209,"試合数":65.0,"打席":149.0,"打数":129.0,"安打":27.0,"本塁打":0.0,"打点":8.0,"盗塁":2.0,"四球":12.0,"死球":0.0,"三振":28.0,"犠打":6.0,"併殺打":5.0,"長打率":0.256,"出塁率":0.273,"OPS":0.529,"RC27":1.96,"XR27":2.07}' BY MODEL test USING complete_row_from_datum""" + val aResult = sendJubaQL(aStmt) + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // now check the result + aResult shouldBe a[Success[_]] + if (aResult.get._1 != 200) + println(stdout.toString) + aResult.get._1 shouldBe 200 + aResult.get._2 \ "result" \ "num_values" match { + case JObject(list) => + val vals = list.collect({ + case (s, JDouble(j)) => (s, j) + }).toMap + Math.abs(vals("長打率") - 0.33874213695526123) should be < 0.00001 + Math.abs(vals("試合数") - 100.953125) should be < 0.00001 + Math.abs(vals("打数") - 307.8046875) should be < 0.00001 + case _ => + fail("there was no 'num_values' key") + } + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + it should "still return HTTP 200 when UPDATE was not run" taggedAs (LocalTest, JubatusTest) in { + val cmResult = sendJubaQL(goodCmStmt) + cmResult shouldBe a[Success[_]] + val cdResult = sendJubaQL(goodCdStmt) + cdResult shouldBe a[Success[_]] + // query + val aResult = sendJubaQL(goodAStmt) + aResult shouldBe a[Success[_]] + aResult.get._1 shouldBe 200 + aResult.get._2 \ "result" \ "predictions" shouldBe a[JArray] + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + it should "return HTTP 500 on bad syntax" taggedAs (LocalTest, JubatusTest) in { + // TODO no it shouldn't (400 is better) + val cmResult = sendJubaQL(goodCmStmt) + cmResult shouldBe a[Success[_]] + val cdResult = sendJubaQL(goodCdStmt) + cdResult shouldBe a[Success[_]] + // the below statement is bad because it references a nonexisting algorithm + val aResult = sendJubaQL( """ANALYZE '{"name": "慶喜"}' BY MODEL test1 USING aNonExistingAlgorithm""") + aResult shouldBe a[Success[_]] + aResult.get._1 shouldBe 500 + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + it should "return HTTP 500 when there is no model" taggedAs (LocalTest) in { + // TODO no it shouldn't (400 is better) + val cdResult = sendJubaQL(goodCdStmt) + cdResult shouldBe a[Success[_]] + val aResult = sendJubaQL(goodAStmt) + aResult shouldBe a[Success[_]] + aResult.get._1 shouldBe 500 + // shut down + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + } + + + "SHUTDOWN" should "stop the running instance" taggedAs (LocalTest) in { + // send only SHUTDOWN + val sdResult = sendJubaQL("SHUTDOWN") + sdResult shouldBe a[Success[_]] + sdResult.get._1 shouldBe 200 + sdResult.get._2 \ "result" shouldBe a[JString] + (sdResult.get._2 \ "result").asInstanceOf[JString].values should startWith("SHUTDOWN") + // wait until shutdown + val exitValue = process.exitValue() + exitValue shouldBe 0 + stdout.toString should include("shut down successfully") + } + + it should "stop even after datasource was created" taggedAs (LocalTest) in { + // set up a data source + val cmResult = sendJubaQL(goodCdStmt) + cmResult shouldBe a[Success[_]] + // send only SHUTDOWN + val 
+
+  it should "stop even after datasource was created" taggedAs (LocalTest) in {
+    // set up a data source
+    val cmResult = sendJubaQL(goodCdStmt)
+    cmResult shouldBe a[Success[_]]
+    // send only SHUTDOWN
+    val sdResult = sendJubaQL("SHUTDOWN")
+    sdResult shouldBe a[Success[_]]
+    sdResult.get._1 shouldBe 200
+    sdResult.get._2 \ "result" shouldBe a[JString]
+    (sdResult.get._2 \ "result").asInstanceOf[JString].values should startWith("SHUTDOWN")
+    // wait until shutdown
+    val exitValue = process.exitValue()
+    exitValue shouldBe 0
+    stdout.toString should include("shut down successfully")
+  }
+
+  it should "stop even after model was created" taggedAs (LocalTest, JubatusTest) in {
+    // set up a model
+    val cmResult = sendJubaQL(goodCmStmt)
+    cmResult shouldBe a[Success[_]]
+    // send only SHUTDOWN
+    val sdResult = sendJubaQL("SHUTDOWN")
+    sdResult shouldBe a[Success[_]]
+    sdResult.get._1 shouldBe 200
+    sdResult.get._2 \ "result" shouldBe a[JString]
+    (sdResult.get._2 \ "result").asInstanceOf[JString].values should startWith("SHUTDOWN")
+    // wait until shutdown
+    val exitValue = process.exitValue()
+    exitValue shouldBe 0
+    stdout.toString should include("shut down successfully")
+  }
+
+  it should "stop within a moderate time even when data is processed" taggedAs (HDFSTest, JubatusTest) in {
+    val cmResult = sendJubaQL("""CREATE CLASSIFIER MODEL test1 WITH(label: "movie_type", datum: ["title", "description"]) config = '{"method": "AROW","converter": { "num_filter_types": {}, "num_filter_rules": [], "string_filter_types": {}, "string_filter_rules": [], "num_types": {}, "num_rules": [],"string_types": {"unigram": { "method": "ngram", "char_num": "1" }},"string_rules": [{ "key": "*", "type": "unigram", "sample_weight": "bin", "global_weight": "bin" } ]},"parameter": {"regularization_weight" : 1.0}}'""")
+    cmResult shouldBe a[Success[_]]
+    val cdResult = sendJubaQL(s"""CREATE DATASOURCE ds1 (movie_type string, title string, description string) FROM (STORAGE: "hdfs:///user/fluentd/dummy", STREAM: "kafka://$kafkaPath/dummy/1")""")
+    cdResult shouldBe a[Success[_]]
+    // start updating
+    val umResult = sendJubaQL("""UPDATE MODEL test1 USING train FROM ds1""")
+    umResult shouldBe a[Success[_]]
+    Thread.sleep(5000)
+    // shut down
+    val startTime = System.nanoTime()
+    val sdResult = sendJubaQL("SHUTDOWN")
+    sdResult shouldBe a[Success[_]]
+    sdResult.get._2 \ "result" shouldBe a[JString]
+    (sdResult.get._2 \ "result").asInstanceOf[JString].values should startWith("SHUTDOWN")
+    // wait until shutdown
+    val exitValue = process.exitValue()
+    exitValue shouldBe 0
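+    // measured from just before SHUTDOWN was sent until process exit;
+    // 25e9 ns == 25 s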
+    val executionTime = (System.nanoTime() - startTime)
+    executionTime.toDouble should be < 25e9 // less than 25 seconds
+  }
+
+
+  override protected def beforeAll(): Unit = {
+    // if there is no script to start the application yet, generate it
+    if (!Files.exists(Paths.get("start-script/run"))) {
+      Seq("sbt", "start-script").!
+    }
+    super.beforeAll()
+  }
+
+  protected def startProcessor(): (Process,
+    StringBuffer, String => Try[(Int, JValue)]) = {
+    val command = Seq("./start-script/run")
+    val (logger, stdoutBuffer, stderrBuffer) = getProcessLogger()
+    val process = command run logger
+    val port = getServerPort(stdoutBuffer)
+    (process, stdoutBuffer, sendJubaQLTo(port))
+  }
+
+  protected def getProcessLogger(): (ProcessLogger, StringBuffer, StringBuffer) = {
+    val stdoutBuffer = new StringBuffer()
+    val stderrBuffer = new StringBuffer()
+    val logger = ProcessLogger(line => {
+      stdoutBuffer append line
+      stdoutBuffer append "\n"
+    },
+      line => {
+        stderrBuffer append line
+        stderrBuffer append "\n"
+      })
+    (logger, stdoutBuffer, stderrBuffer)
+  }
+
+  protected def getServerPort(stdout: StringBuffer): Int = {
+    val portRe = "(?s).+listening on port ([0-9]+)\n".r
+    var port = 0
+    while (port == 0) {
+      stdout.toString match {
+        case portRe(loggedPort) =>
+          port = loggedPort.toInt
+        case _ =>
+          Thread.sleep(100)
+      }
+    }
+    port
+  }
+
+  protected def sendJubaQLTo(port: Int)(stmt: String): Try[(Int, JValue)] = {
+    val url = :/("localhost", port) / "jubaql"
+    val body = compact(render("query" -> stmt))
+    Http(url << body).either.apply() match {
+      case Left(error) =>
+        Failure(error)
+      case Right(response) =>
+        Try {
+          (response.getStatusCode,
+            parse(response.getResponseBody("UTF-8")))
+        }
+    }
+  }
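+
+  // Example use of the helpers above (sketch; `startProcessor` is
+  // presumably called from the suite's setup, which is not shown here):
+  //
+  //   val (process, stdout, sendJubaQL) = startProcessor()
+  //   sendJubaQL("""CREATE DATASOURCE ...""") match {
+  //     case Success((status, json)) => status shouldBe 200
+  //     case Failure(err)            => fail(err.toString)
+  //   }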
+}
diff --git a/processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLServiceHelperSpec.scala b/processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLServiceHelperSpec.scala
new file mode 100644
index 0000000..4525dc8
--- /dev/null
+++ b/processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLServiceHelperSpec.scala
@@ -0,0 +1,92 @@
+// Jubatus: Online machine learning framework for distributed environment
+// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation.
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License version 2.1 as published by the Free Software Foundation.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+package us.jubat.jubaql_server.processor
+
+import org.scalatest.{ShouldMatchers, BeforeAndAfterAll, FlatSpec}
+import org.apache.spark.SparkContext
+
+/* This test case tests only the state-independent (helper) functions of
+ * JubaQLService (such as `parseJson()` or `extractDatum()`). It does
+ * not test interaction with external components or anything that
+ * requires state change.
+ * (The reason is that exiting cleanly would require killing the JVM
+ * that runs the JubaQLProcessor.)
+ */
+class JubaQLServiceHelperSpec extends FlatSpec with ShouldMatchers with BeforeAndAfterAll {
+  private var sc: SparkContext = null
+  private var service: JubaQLServiceTester = null
+
+  // create a subclass to test the protected methods
+  class JubaQLServiceTester(sc: SparkContext) extends JubaQLService(sc, RunMode.Development) {
+    override def parseJson(in: String): Option[JubaQLAST] =
+      super.parseJson(in)
+  }
+
+  "parseJson()" should "be able to parse JSON" taggedAs (LocalTest) in {
+    val query = """
+      CREATE DATASOURCE test1 (column_type1 string, column_type2 numeric, column_type3 boolean)
+      FROM (STORAGE: "hdfs://hello", STREAM: "fluentd://f1", STREAM: "fluentd://f2")
+      """.stripMargin.trim
+    val json = """{"query": "%s"}""".format(query.replace("\"", "\\\""))
+    val result = service.parseJson(json)
+    result should not be empty
+    result.get shouldBe a[CreateDatasource]
+  }
+
+  it should "be able to parse JSON with additional fields" taggedAs (LocalTest) in {
+    val query = """
+      CREATE DATASOURCE test1 (column_type1 string, column_type2 numeric, column_type3 boolean)
+      FROM (STORAGE: "hdfs://hello", STREAM: "fluentd://f1", STREAM: "fluentd://f2")
+      """.stripMargin.trim
+    val json = """{"session_id": "test", "query": "%s"}""".format(query.replace("\"", "\\\""))
+    val result = service.parseJson(json)
+    result should not be empty
+    result.get shouldBe a[CreateDatasource]
+  }
+
+  it should "yield None if the JSON contains a bogus query" taggedAs (LocalTest) in {
+    val json = """{"query": "test"}"""
+    val result = service.parseJson(json)
+    result shouldBe empty
+  }
+
+  it should "yield None if the JSON contains a non-string query" taggedAs (LocalTest) in {
+    val json = """{"query": 27}"""
+    val result = service.parseJson(json)
+    result shouldBe empty
+  }
+
+  it should "yield None if the JSON contains no query" taggedAs (LocalTest) in {
+    val json = """{"foo": "bar"}"""
+    val result = service.parseJson(json)
+    result shouldBe empty
+  }
+
+  it should "yield None if the string is not JSON" taggedAs (LocalTest) in {
+    // note: this must be a string that does not parse as JSON at all
+    // (the previous revision reused the valid JSON from the test above)
+    val notJson = """this is not JSON"""
+    val result = service.parseJson(notJson)
+    result shouldBe empty
+  }
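+
+  // "local[3]" runs an in-process Spark master with three worker threads,
+  // so these tests need no external cluster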
+  override protected def beforeAll(): Unit = {
+    sc = new SparkContext("local[3]", "JubaQL Processor Test")
+    service = new JubaQLServiceTester(sc)
+  }
+
+  override protected def afterAll(): Unit = {
+    sc.stop()
+  }
+}
diff --git a/processor/src/test/scala/us/jubat/jubaql_server/processor/LocalJubatusApplicationSpec.scala b/processor/src/test/scala/us/jubat/jubaql_server/processor/LocalJubatusApplicationSpec.scala
new file mode 100644
index 0000000..52b027b
--- /dev/null
+++ b/processor/src/test/scala/us/jubat/jubaql_server/processor/LocalJubatusApplicationSpec.scala
@@ -0,0 +1,159 @@
+// Jubatus: Online machine learning framework for distributed environment
+// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation.
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License version 2.1 as published by the Free Software Foundation.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+package us.jubat.jubaql_server.processor
+
+import org.scalatest._
+import us.jubat.yarn.common.LearningMachineType
+import scala.concurrent._
+import ExecutionContext.Implicits.global
+import scala.concurrent.duration.Duration
+import scala.util.Success
+
+class LocalJubatusApplicationSpec extends FlatSpec with ShouldMatchers {
+
+  val anomalyConfig = """{
+  "method" : "lof",
+  "parameter" : {
+    "nearest_neighbor_num" : 10,
+    "reverse_nearest_neighbor_num" : 30,
+    "method" : "euclid_lsh",
+    "parameter" : {
+      "hash_num" : 64,
+      "table_num" : 4,
+      "seed" : 1091,
+      "probe_num" : 64,
+      "bin_width" : 100,
+      "retain_projection" : false
+    }
+  },
+  "converter" : {
+    "string_filter_types" : {},
+    "string_filter_rules" : [],
+    "num_filter_types" : {},
+    "num_filter_rules" : [],
+    "string_types" : {},
+    "string_rules" : [
+      { "key" : "*", "type" : "str", "sample_weight" : "bin", "global_weight" : "bin" }
+    ],
+    "num_types" : {},
+    "num_rules" : [
+      { "key" : "*", "type" : "num" }
+    ]
+  }
+}"""
+
+  val classifierConfig = """{
+  "method" : "AROW",
+  "parameter" : {
+    "regularization_weight" : 1.0
+  },
+  "converter" : {
+    "string_filter_types" : {},
+    "string_filter_rules" : [],
+    "num_filter_types" : {},
+    "num_filter_rules" : [],
+    "string_types" : {},
+    "string_rules" : [
+      { "key" : "*", "type" : "str", "sample_weight" : "bin", "global_weight" : "bin" }
+    ],
+    "num_types" : {},
+    "num_rules" : [
+      { "key" : "*", "type" : "num" }
+    ]
+  }
+}"""
+
+  val recommenderConfig = """{
+  "method": "lsh",
+  "parameter" : {
+    "hash_num" : 64
+  },
+  "converter" : {
+    "string_filter_types": {},
+    "string_filter_rules":[],
+    "num_filter_types": {},
+    "num_filter_rules": [],
+    "string_types": {},
+    "string_rules":[
+      {"key" : "*", "type" : "str", "sample_weight":"bin", "global_weight" : "bin"}
+    ],
+    "num_types": {},
+    "num_rules": [
+      {"key" : "*", "type" : "num"}
+    ]
+  }
+}"""
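+
+  // The three strings above are plain Jubatus engine configurations
+  // (method / parameter / converter); the tests below hand them to
+  // LocalJubatusApplication.start, which should launch the matching
+  // jubaanomaly/jubaclassifier/jubarecommender process.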
+ line.contains("named pipe") shouldBe true + } finally { + namedPipe.delete() + } + } + + "jubaanomaly" should "start" taggedAs (LocalTest, JubatusTest) in { + val f = LocalJubatusApplication.start("foo", LearningMachineType.Anomaly, anomalyConfig) + Await.ready(f, Duration.Inf) + val result = f.value.get + result shouldBe a[Success[_]] + result match { + case Success(app) => + Await.ready(app.stop(), Duration.Inf) + case _ => + } + } + + "jubaclassifier" should "start" taggedAs (LocalTest, JubatusTest) in { + val f = LocalJubatusApplication.start("bar", LearningMachineType.Classifier, classifierConfig) + Await.ready(f, Duration.Inf) + val result = f.value.get + result shouldBe a[Success[_]] + result match { + case Success(app) => + Await.ready(app.stop(), Duration.Inf) + case _ => + } + } + + "jubarecommender" should "start" taggedAs (LocalTest, JubatusTest) in { + val f = LocalJubatusApplication.start("baz", LearningMachineType.Recommender, recommenderConfig) + Await.ready(f, Duration.Inf) + val result = f.value.get + result shouldBe a[Success[_]] + result match { + case Success(app) => + Await.ready(app.stop(), Duration.Inf) + case _ => + } + } +} diff --git a/processor/src/test/scala/us/jubat/jubaql_server/processor/MockServer.scala b/processor/src/test/scala/us/jubat/jubaql_server/processor/MockServer.scala new file mode 100644 index 0000000..e72043b --- /dev/null +++ b/processor/src/test/scala/us/jubat/jubaql_server/processor/MockServer.scala @@ -0,0 +1,36 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import org.scalatest.{Suite, BeforeAndAfterAll} +import unfiltered.util.RunnableServer + +/** Creates a mock HTTP server on localhost:9877 to test HTTP clients. + */ +trait MockServer extends BeforeAndAfterAll { + this: Suite => + + // this needs to be overridden to define server behavior + protected val server: RunnableServer + + override protected def beforeAll() = { + server.start() + } + + override protected def afterAll(): Unit = { + server.stop() + } +} diff --git a/processor/src/test/scala/us/jubat/jubaql_server/processor/RegistrationSpec.scala b/processor/src/test/scala/us/jubat/jubaql_server/processor/RegistrationSpec.scala new file mode 100644 index 0000000..dac54ef --- /dev/null +++ b/processor/src/test/scala/us/jubat/jubaql_server/processor/RegistrationSpec.scala @@ -0,0 +1,189 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. 
+
+  override protected def beforeAll() = {
+    server.start()
+  }
+
+  override protected def afterAll(): Unit = {
+    server.stop()
+  }
+}
diff --git a/processor/src/test/scala/us/jubat/jubaql_server/processor/RegistrationSpec.scala b/processor/src/test/scala/us/jubat/jubaql_server/processor/RegistrationSpec.scala
new file mode 100644
index 0000000..dac54ef
--- /dev/null
+++ b/processor/src/test/scala/us/jubat/jubaql_server/processor/RegistrationSpec.scala
@@ -0,0 +1,189 @@
+// Jubatus: Online machine learning framework for distributed environment
+// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation.
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License version 2.1 as published by the Free Software Foundation.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+package us.jubat.jubaql_server.processor
+
+import scala.sys.process._
+import scala.collection.mutable
+import org.scalatest._
+import java.nio.file.{Paths, Files}
+import unfiltered.response._
+import unfiltered.util.RunnableServer
+import unfiltered.request._
+import org.json4s.JsonAST.{JString, JInt, JValue}
+
+/** Tests the correct behavior with respect to registration at the gateway.
+ *
+ * We want to run the application with parameters in a separate process
+ * and capture the output and return code. We use the sbt-start-script
+ * plugin to run the application, since running with `sbt run` leads to
+ * significant overhead and makes dealing with output complicated.
+ */
+class RegistrationSpec extends FlatSpec with Matchers with MockServer {
+
+  override protected def beforeAll(): Unit = {
+    // if there is no script to start the application yet, generate it
+    if (!Files.exists(Paths.get("start-script/run"))) {
+      Seq("sbt", "start-script").!
+    }
+    super.beforeAll()
+  }
+
+  /**
+   * Returns a ProcessLogger logging stdout/stderr to two StringBuffers.
+   */
+  protected def getProcessLogger(): (ProcessLogger, StringBuffer, StringBuffer) = {
+    val stdoutBuffer = new StringBuffer()
+    val stderrBuffer = new StringBuffer()
+    val logger = ProcessLogger(line => {
+      stdoutBuffer append line
+      stdoutBuffer append "\n"
+    },
+      line => {
+        stderrBuffer append line
+        stderrBuffer append "\n"
+      })
+    (logger, stdoutBuffer, stderrBuffer)
+  }
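+
+  // note: this duplicates the helper of the same name in
+  // JubaQLProcessorSpec; a shared test-support trait could host it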
+
+  // First, check for invalid input
+
+  "Passing an invalid string as a URL" should "print an error and exit" taggedAs (LocalTest) in {
+    val command = Seq("./start-script/run", "xyz")
+    val (logger, stdoutBuffer, stderrBuffer) = getProcessLogger()
+    val exitCode = command ! logger
+    // check exit code and console output
+    exitCode shouldBe 1
+    stdoutBuffer.toString should include("invalid URL provided")
+  }
+
+  "Passing a URL of a non-existent server" should "print an error and exit" in {
+    // start the client and specify a server that does (probably) not exist
+    val command = Seq("./start-script/run", "http://lameiq2elakliajdlawkidl.jp/hoge")
+    val (logger, stdoutBuffer, stderrBuffer) = getProcessLogger()
+    val exitCode = command ! logger
+    // check exit code and console output
+    exitCode shouldBe 1
+    stdoutBuffer.toString should include("registration failed: " +
+      "java.net.ConnectException")
+  }
+
+  // Now, check for valid input
+
+  // received JSON will be stored in these maps for later inspection
+  val registerData: mutable.SynchronizedMap[String, JValue] =
+    new mutable.HashMap[String, JValue]() with mutable.SynchronizedMap[String, JValue]
+  val unregisterData: mutable.SynchronizedMap[String, JValue] =
+    new mutable.HashMap[String, JValue]() with mutable.SynchronizedMap[String, JValue]
+
+  // this server mocks the gateway
+  protected val server: RunnableServer = {
+    unfiltered.netty.Server.http(9877).plan(
+      // define the server behavior
+      unfiltered.netty.cycle.Planify {
+        // return some result if a query is given with valid json
+        case req@POST(Path(Seg("test" :: testId :: Nil))) =>
+          val body = JsonBody(req)
+          body match {
+            case Some(bodyData) =>
+              val JString(action) = bodyData \ "action"
+              // store the received JValue in a map for examination
+              if (action == "register") {
+                registerData += (testId -> bodyData)
+                Ok ~> ResponseString("")
+              } else if (action == "unregister") {
+                unregisterData += (testId -> bodyData)
+                Ok ~> ResponseString("")
+              } else {
+                BadRequest ~> ResponseString("error")
+              }
+            case None =>
+              InternalServerError ~> ResponseString("error")
+          }
+        case _ =>
+          NotFound ~> ResponseString("404")
+      })
+  }
+
+  "Passing the URL of a gateway-like server" should "register there" taggedAs (LocalTest) in {
+    val command = Seq("./start-script/run", "http://localhost:9877/test/reg1")
+    val (logger, stdoutBuffer, stderrBuffer) = getProcessLogger()
+    // run the command
+    val process = command run logger
+    var submittedJson: Option[JValue] = None
+    var waitedTime = 0
+    // wait until we receive data or reach timeout
+    while (submittedJson.isEmpty && waitedTime < 20000) {
+      submittedJson = registerData.get("reg1")
+      Thread.sleep(200)
+      waitedTime += 200
+    }
+    val exitCode = killSpawnedProgram(process)
+    // check exit code and console output
+    exitCode shouldBe 0
+    stdoutBuffer.toString should include(" registered successfully")
+    stdoutBuffer.toString should include("JubaQLProcessor shut down successfully\n")
+    // check the JSON sent for registration purposes
+    submittedJson should not be empty
+    val json = submittedJson.get
+    (json \ "ip") shouldBe a[JString]
+    (json \ "port").getClass shouldBe JInt(1).getClass
+  }
+
+  it should "unregister after receiving SIGTERM" taggedAs (LocalTest) in {
+    val command = Seq("./start-script/run", "http://localhost:9877/test/unreg1")
+    val (logger, stdoutBuffer, stderrBuffer) = getProcessLogger()
+    // run the command
+    val process = command run logger
+    // wait until registration is complete
+    var waitedTime = 0
+    while (registerData.get("unreg1").isEmpty && waitedTime < 20000) {
+      Thread.sleep(200)
+      waitedTime += 200
+    }
+    val exitCode = killSpawnedProgram(process)
+    // check exit code and console output
+    exitCode shouldBe 0
+    stdoutBuffer.toString should include("unregistered successfully")
+    stdoutBuffer.toString should include("JubaQLProcessor shut down successfully\n")
+    // check the JSON sent for unregistration purposes
+    val submittedJson = unregisterData.get("unreg1")
+    submittedJson should not be empty
+  }
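+
+  // Judging from the assertions above, a registration body has the shape
+  // (values illustrative):
+  //
+  //   {"action": "register", "ip": "10.0.0.1", "port": 12345}
+  //
+  // and an unregister request carries "action": "unregister".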
+
+  private def killSpawnedProgram(process: Process): Int = {
+    // get the PID of the currently running program by means of `ps` parsing
+    val programs = ("ps x" #| "grep java").!!
+    // on Linux, `ps x` output looks like:  "13239 ?  Sl  0:16 java [...]"
+    // on Mac OS, it looks like:            "39905 s001  S+  0:08.50 /usr/bin/java [...]"
+    val psRe = """ *([0-9]+) .+?:[0-9.]+ (.+)""".r
+    // TODO is there a way to get child processes using pstree etc.?
+    val spawnedProcesses = programs.split('\n').toList.collect {
+      // parse `ps x` output
+      case psRe(pid, cmd) => (pid, cmd)
+    }.filter {
+      // try to find the program that we spawned
+      case (pid, cmd) => cmd.contains("processor/") && cmd.contains("/target/scala-2.10/classes:")
+    }
+    // send a SIGTERM to end the program
+    spawnedProcesses.foreach {
+      case (pid, cmd) =>
+        s"kill $pid".!!
+    }
+    // now wait for exit
+    process.exitValue()
+  }
+}
diff --git a/processor/src/test/scala/us/jubat/jubaql_server/processor/TestTags.scala b/processor/src/test/scala/us/jubat/jubaql_server/processor/TestTags.scala
new file mode 100644
index 0000000..2e918ef
--- /dev/null
+++ b/processor/src/test/scala/us/jubat/jubaql_server/processor/TestTags.scala
@@ -0,0 +1,40 @@
+// Jubatus: Online machine learning framework for distributed environment
+// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation.
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License version 2.1 as published by the Free Software Foundation.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+package us.jubat.jubaql_server.processor
+
+import org.scalatest.Tag
+
+/*
+ * These are tags for tests so that, for example, only tests that require no
+ * network connection can be run.
+ * When testing from sbt, use
+ *   > testOnly us.jubat.jubaql_server.processor.* -- -l jubaql.HDFSTest
+ * to exclude HDFSTest-tagged tests from the run, and
+ *   > testOnly us.jubat.jubaql_server.processor.* -- -n jubaql.LocalTest
+ * to include only LocalTest-tagged tests in the run.
+ */
+
+// used for tests that require no network connection or any external service
+object LocalTest extends Tag("jubaql.LocalTest")
+
+// used for tests that require Jubatus being installed locally
+object JubatusTest extends Tag("jubaql.JubatusTest")
+
+// used for tests that use an HDFS installation in some network
+object HDFSTest extends Tag("jubaql.HDFSTest")
+
+// used for tests that use a Kafka installation in some network
+object KafkaTest extends Tag("jubaql.KafkaTest")
diff --git a/processor/start-script/.keep b/processor/start-script/.keep
new file mode 100644
index 0000000..e69de29