From fe4df9ecd97f44c2365763f9cc13f8c08f934918 Mon Sep 17 00:00:00 2001 From: Tobias Pfeiffer Date: Mon, 29 Jun 2015 14:10:35 +0900 Subject: [PATCH] release v1.3.0 --- Changelog.md | 68 + README.md | 34 +- gateway/build.sbt | 2 +- .../jubaql_server/gateway/GatewayPlan.scala | 13 +- .../jubaql_server/gateway/JubaQLGateway.scala | 41 +- .../src/test/resources/processor-logfile.jar | Bin 728 -> 754 bytes .../jubaql_server/gateway/GatewayServer.scala | 4 +- increase-version.sh | 22 + processor/NOTICE_SPARK | 574 +++ processor/assembly.sbt | 22 +- processor/build.sbt | 38 +- processor/project/assembly.sbt | 2 +- processor/project/build.properties | 1 + processor/project/deptree.sbt | 2 +- .../src/main/resources/log4j-spark-submit.xml | 70 + processor/src/main/resources/log4j.xml | 14 +- .../dstream/OrderedFileInputDStream.scala | 39 +- .../processor/AggregateFunctions.scala | 281 ++ .../processor/DatumExtractor.scala | 282 ++ .../processor/HandleExceptions.scala | 2 + .../jubaql_server/processor/Helpers.scala | 47 + .../processor/HybridProcessor.scala | 261 +- .../processor/JavaScriptHelpers.scala | 150 + .../processor/JavaScriptUDFManager.scala | 131 + .../jubaql_server/processor/JubaQLAST.scala | 42 +- ...erPrediction.scala => JubaQLContext.scala} | 7 +- .../processor/JubaQLParser.scala | 324 +- .../processor/JubaQLPatternLayout.scala | 46 + .../processor/JubaQLProcessor.scala | 16 +- .../processor/JubaQLService.scala | 1484 +++++-- .../processor/PreparedJubaQLStatement.scala | 60 + .../processor/RegistrationHandler.scala | 2 +- .../processor/SchemaDStream.scala | 242 ++ .../processor/SlidingWindow.scala | 655 +++ .../processor/StringWrapper.scala | 3 + ...sifierResult.scala => AnalyzeResult.scala} | 14 + ...{AnomalyScore.scala => ErrorMessage.scala} | 2 +- ...DatumResult.scala => JubaQLResponse.scala} | 15 +- .../processor/json/Register.scala | 2 +- .../processor/json/Unregister.scala | 2 +- .../processor/udf/AggregateFunctions.scala | 413 ++ .../udf/OrderedValueRDDFunctions.scala | 81 + .../processor/updater/Anomaly.scala | 99 +- .../processor/updater/Classifier.scala | 150 +- .../processor/updater/HttpClientPerJvm.scala | 39 +- .../processor/updater/JubatusClient.scala | 154 + .../processor/updater/Recommender.scala | 173 +- .../processor/updater/Updater.scala | 54 - processor/src/test/resources/dummydata/1.json | 2 - processor/src/test/resources/dummydata/2.json | 2 - .../src/test/resources/dummydata/data.json | 12 + processor/src/test/resources/lof.json | 16 - .../test/resources/npb_similar_player.json | 12 - processor/src/test/resources/shogun.json | 14 - .../src/test/resources/shogun_alpha_data.json | 44 + processor/src/test/resources/shogun_full.json | 20 + .../resources/shogun_splitted_name_data.json | 44 + .../org/apache/spark/LocalSparkContext.scala | 65 + .../org/apache/spark/SharedSparkContext.scala | 42 + .../apache/spark/sql/json/JsonRDDCopy.scala | 398 ++ .../processor/AggregateFunctionSpec.scala | 223 + .../processor/HasKafkaPath.scala | 2 +- .../processor/HybridProcessorSpec.scala | 447 +- .../processor/JavaScriptSpec.scala | 264 ++ .../processor/JubaQLExtractorSpec.scala | 504 +++ .../processor/JubaQLParserSpec.scala | 344 +- .../processor/JubaQLProcessorSpec.scala | 564 --- .../processor/JubaQLServiceHelperSpec.scala | 29 +- .../LocalJubatusApplicationSpec.scala | 23 + .../jubaql_server/processor/ProcessUtil.scala | 12 + .../processor/RegistrationSpec.scala | 14 +- .../processor/SchemaDStreamSpec.scala | 177 + .../processor/SlidingStreamSpec.scala | 278 ++ 
.../jubaql_server/processor/TestTags.scala | 3 + .../integration/JubaQLProcessorSpec.scala | 3839 +++++++++++++++++ .../udf/AggregateFunctionsTest.scala | 388 ++ .../udf/ReferenceImplementation.scala | 157 + .../udf/ReferenceImplementationTest.scala | 143 + 78 files changed, 13029 insertions(+), 1232 deletions(-) create mode 100644 Changelog.md create mode 100755 increase-version.sh create mode 100644 processor/NOTICE_SPARK create mode 100644 processor/project/build.properties create mode 100644 processor/src/main/resources/log4j-spark-submit.xml create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/AggregateFunctions.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/DatumExtractor.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/Helpers.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/JavaScriptHelpers.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/JavaScriptUDFManager.scala rename processor/src/main/scala/us/jubat/jubaql_server/processor/{json/ClassifierPrediction.scala => JubaQLContext.scala} (79%) create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLPatternLayout.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/PreparedJubaQLStatement.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/SchemaDStream.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/SlidingWindow.scala rename processor/src/main/scala/us/jubat/jubaql_server/processor/json/{ClassifierResult.scala => AnalyzeResult.scala} (70%) rename processor/src/main/scala/us/jubat/jubaql_server/processor/json/{AnomalyScore.scala => ErrorMessage.scala} (95%) rename processor/src/main/scala/us/jubat/jubaql_server/processor/json/{DatumResult.scala => JubaQLResponse.scala} (68%) create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/udf/AggregateFunctions.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/udf/OrderedValueRDDFunctions.scala create mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/updater/JubatusClient.scala delete mode 100644 processor/src/main/scala/us/jubat/jubaql_server/processor/updater/Updater.scala delete mode 100644 processor/src/test/resources/dummydata/1.json delete mode 100644 processor/src/test/resources/dummydata/2.json create mode 100644 processor/src/test/resources/dummydata/data.json create mode 100644 processor/src/test/resources/shogun_alpha_data.json create mode 100644 processor/src/test/resources/shogun_full.json create mode 100644 processor/src/test/resources/shogun_splitted_name_data.json create mode 100644 processor/src/test/scala/org/apache/spark/LocalSparkContext.scala create mode 100644 processor/src/test/scala/org/apache/spark/SharedSparkContext.scala create mode 100644 processor/src/test/scala/org/apache/spark/sql/json/JsonRDDCopy.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/AggregateFunctionSpec.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/JavaScriptSpec.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLExtractorSpec.scala delete mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/JubaQLProcessorSpec.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/ProcessUtil.scala create mode 
100644 processor/src/test/scala/us/jubat/jubaql_server/processor/SchemaDStreamSpec.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/SlidingStreamSpec.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/integration/JubaQLProcessorSpec.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/udf/AggregateFunctionsTest.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/udf/ReferenceImplementation.scala create mode 100644 processor/src/test/scala/us/jubat/jubaql_server/processor/udf/ReferenceImplementationTest.scala diff --git a/Changelog.md b/Changelog.md new file mode 100644 index 0000000..1785c54 --- /dev/null +++ b/Changelog.md @@ -0,0 +1,68 @@
+Changelog
+=========
+
+1.3.0
+-----
+
+### New Features
+
+* Cascaded Processing
+
+  * The concept of a user-defined "stream" was introduced.
+    Similar to `CREATE VIEW` in SQL, `CREATE STREAM name FROM SELECT ...`
+    allows the user to create a stream holding the results of a SELECT
+    query over some input stream.
+  * In particular, `ANALYZE` results can be added to a stream in a new
+    column and used further down in the processing pipeline.
+  * A user can define custom functions in JavaScript via the
+    `CREATE FUNCTION` statement and use them in queries.
+  * Multiple data sources can be defined and used one after another
+    for updating/analyzing a model.
+
+* Trigger-Based Action
+
+  * A user can also define functions without a return value using
+    `CREATE TRIGGER FUNCTION` and attach them as triggers on a stream
+    using `CREATE TRIGGER`. This can be used to act based on the contents
+    of a stream, in particular on analysis results.
+
+* Time-Series Analysis using Sliding Windows
+
+  * To analyze time-series data, sliding windows over an input stream
+    (based on either item count or an embedded timestamp) can be computed
+    and the data in each window aggregated using a set of provided
+    functions such as standard deviation or histogram.
+  * The results of this aggregation can be used like any other data
+    stream.
+
+* Other
+
+  * It is now possible to do feature extraction using user-defined
+    functions.
+
+### Breaking Changes
+
+* `CREATE DATASOURCE`
+
+  * A schema should now generally be provided, because in many cases
+    schema inference will lead to errors whenever an empty data batch
+    is encountered.
+
+* `CREATE MODEL`
+
+  * The syntax for specifying the label/id column has changed from
+    `model_name WITH (label: "class", datum: "name")` to
+    `model_name (label: class) AS ...`
+  * Feature converters are no longer specified in the JSON configuration,
+    but instead with a `column WITH converter` syntax.
+
+* `UPDATE MODEL`
+
+  * The statement now only establishes the connection between stream and
+    model; processing does not start yet. It is started by the separate
+    `START PROCESSING` statement.
+
+1.2.0
+-----
+
+This is the first public release. See the documentation for features and usage information.
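The statement fragments quoted in this changelog can be combined with the `CREATE DATASOURCE` / `UPDATE MODEL` / `START PROCESSING` / `ANALYZE` statements shown in the README further below. The following is only a minimal sketch of cascaded processing: the stream name `projected` and the body of the `SELECT` are illustrative assumptions, the `--` comments are annotation rather than claimed JubaQL syntax, and the trigger and sliding-window statements are omitted because their full syntax is not spelled out in this patch excerpt.

```
-- illustrative sketch only; names and SELECT body are assumptions, not tested syntax
CREATE CLASSIFIER MODEL test (label: label) AS name WITH unigram CONFIG '{"method": "AROW", "parameter": {"regularization_weight" : 1.0}}'
CREATE DATASOURCE shogun (label string, name string) FROM (STORAGE: "file://data/shogun_data.json")
-- new in 1.3.0: derive a stream from a SELECT over the data source (hypothetical SELECT body)
CREATE STREAM projected FROM SELECT label, name FROM shogun
-- UPDATE MODEL only wires stream and model together; processing starts with START PROCESSING
UPDATE MODEL test USING train FROM projected
START PROCESSING shogun
ANALYZE '{"name": "慶喜"}' BY MODEL test USING classify
```

This mirrors the README walkthrough, with the new `CREATE STREAM` step inserted between the data source definition and the model update.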
diff --git a/README.md b/README.md index 44f1780..022e236 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,12 @@ How to get started with JubaQL ### Development Setup -* Get a Hadoop-enabled version of Spark 1.1.1: - `wget http://d3kbcqa49mib13.cloudfront.net/spark-1.1.1-bin-hadoop2.4.tgz` +* Get a Hadoop-enabled version of Spark 1.2.2: + `wget http://d3kbcqa49mib13.cloudfront.net/spark-1.2.2-bin-hadoop2.4.tgz` and unpack it somewhere: - `tar -xzf spark-1.1.1-bin-hadoop2.4.tgz && export SPARK_DIST="$(pwd)/spark-1.1.1-bin-hadoop2.4/"` + `tar -xzf spark-1.2.2-bin-hadoop2.4.tgz && export SPARK_DIST="$(pwd)/spark-1.2.2-bin-hadoop2.4/"` * Install Jubatus. -* Get JubaQLClient and JubaQLServer (consists of JubaQLProcessor and JubaQLGateway): +* Get JubaQL-Client and JubaQL-Server: `git clone https://github.com/jubatus/jubaql-client.git` `git clone https://github.com/jubatus/jubaql-server.git` * Build the JubaQL components: @@ -22,16 +22,17 @@ How to get started with JubaQL * JubaQLGateway: `cd jubaql-server/gateway && sbt assembly && cd ../..` * Start the JubaQLGateway: - `cd jubaql-server && java -Dspark.distribution="$SPARK_DIST" -Djubaql.processor.fatjar=processor/target/scala-2.10/jubaql-processor-assembly-1.2.0.jar -jar gateway/target/scala-2.10/jubaql-gateway-assembly-1.2.0.jar -i 127.0.0.1` + `cd jubaql-server && java -Dspark.distribution="$SPARK_DIST" -Djubaql.processor.fatjar=processor/target/scala-2.10/jubaql-processor-assembly-1.3.0.jar -jar gateway/target/scala-2.10/jubaql-gateway-assembly-1.3.0.jar -i 127.0.0.1` * In a different shell, start the JubaQLClient: `./jubaql-client/target/start` -* You will see the prompt `jubaql>` in the shell and you will in fact be able to type your commands there, but until the JubaQLProcessor is up and running correctly, you will get the message "Unexpected response status: 503". +* You will see the prompt `jubaql>` in the shell and you will in fact be able to type your commands there, but until the JubaQLProcessor is up and running correctly, you will see the message: "This session has not been registered. Wait a second." In order to test that your setup is working correctly, you can do a simple classification using the data from the [shogun example](https://github.com/jubatus/jubatus-example/tree/master/shogun). Run the following JubaQL commands in the client: -* `CREATE CLASSIFIER MODEL test WITH (label: "label", datum: "name") config = '{"method": "AROW","converter": { "num_filter_types": {}, "num_filter_rules": [], "string_filter_types": {}, "string_filter_rules": [], "num_types": {}, "num_rules": [],"string_types": {"unigram": { "method": "ngram", "char_num": "1" }},"string_rules": [{ "key": "*", "type": "unigram", "sample_weight": "bin", "global_weight": "bin" } ]},"parameter": {"regularization_weight" : 1.0}}'` +* `CREATE CLASSIFIER MODEL test (label: label) AS name WITH unigram CONFIG '{"method": "AROW", "parameter": {"regularization_weight" : 1.0}}'` * `CREATE DATASOURCE shogun (label string, name string) FROM (STORAGE: "file://data/shogun_data.json")` * `UPDATE MODEL test USING train FROM shogun` +* `START PROCESSING shogun` * `ANALYZE '{"name": "慶喜"}' BY MODEL test USING classify` * `SHUTDOWN` @@ -42,7 +43,7 @@ The JSON returned by the `ANALYZE` statement should indicate that the label "徳 * Set up a Hadoop cluster with YARN and HDFS in place. * Install Jubatus on all cluster nodes. * Get JubaQL and compile it as described above. (This time, Jubatus is not required locally.) 
-* Install the [Jubatus on YARN](https://github.com/jubatus/jubatus-on-yarn) libraries in HDFS as described in [the instructions](https://github.com/jubatus/jubatus-on-yarn/blob/master/document/%E3%83%93%E3%83%AB%E3%83%89%E3%83%BB%E5%88%A9%E7%94%A8%E6%89%8B%E9%A0%86%E6%9B%B8.md#%E5%AE%9F%E8%A1%8C%E3%81%AB%E5%BF%85%E8%A6%81%E3%81%AA%E3%83%95%E3%82%A1%E3%82%A4%E3%83%AB%E3%81%AE%E6%BA%96%E5%82%99). Make sure that the HDFS directory `/jubatus-on-yarn/application-master/jubaconfig/` exists and is writeable by the user running the JubaQLProcessor application. +* Install the [Jubatus on YARN](https://github.com/jubatus/jubatus-on-yarn) libraries in HDFS as described in [the instructions](https://github.com/jubatus/jubatus-on-yarn/blob/master/document/instruction.md#required-files). Make sure that the HDFS directory `/jubatus-on-yarn/application-master/jubaconfig/` exists and is writeable by the user running the JubaQLProcessor application. * To test the setup, also copy the file `shogun-data.json` from the JubaQL source tree's `data/` directory to `/jubatus-on-yarn/sample/shogun_data.json` in HDFS. * Copy the files `core-site.xml`, `yarn-site.xml`, `hdfs-site.xml` containing your Hadoop setup description from one of your cluster nodes to some directory and point the environment variable `HADOOP_CONF_DIR` to that directory. * Get your local computer's IP address that points towards the cluster. On Linux, given the IP address of one of your cluster nodes, this should be possible with something like: @@ -50,26 +51,29 @@ The JSON returned by the `ANALYZE` statement should indicate that the label "徳 Make sure that this IP address can be connected to from the cluster nodes and no firewall rules etc. are blocking access. * Get the addresses of your Zookeeper nodes and concatenate their `host:port` locations with a comma: `export MY_ZOOKEEPER=zk1:2181,zk2:2181` +* Locate a temporary directory in HDFS that Spark can use for checkpointing: + `export CHECKPOINT=hdfs:///tmp/spark` * Start the JubaQLGateway: - `cd jubaql-server` - `java -Drun.mode=production -Djubaql.zookeeper=$MY_ZOOKEEPER -Dspark.distribution="$SPARK_DIST" -Djubaql.processor.fatjar=processor/target/scala-2.10/jubaql-processor-assembly-1.2.0.jar -jar gateway/target/scala-2.10/jubaql-gateway-assembly-1.2.0.jar -i $MY_IP` + `cd jubaql-server` + `java -Drun.mode=production -Djubaql.checkpointdir=$CHECKPOINT -Djubaql.zookeeper=$MY_ZOOKEEPER -Dspark.distribution="$SPARK_DIST" -Djubaql.processor.fatjar=processor/target/scala-2.10/jubaql-processor-assembly-1.3.0.jar -jar gateway/target/scala-2.10/jubaql-gateway-assembly-1.3.0.jar -i $MY_IP` * In a different shell, start the JubaQLClient: `./jubaql-client/target/start` -* You will see the prompt `jubaql>` in the shell and you will in fact be able to type your commands there, but until the JubaQLProcessor is up and running correctly, you will get the message "Unexpected response status: 503". +* You will see the prompt `jubaql>` in the shell and you will in fact be able to type your commands there, but until the JubaQLProcessor is up and running correctly, you will see the message: "This session has not been registered. Wait a second." In order to test that your setup is working correctly, you can do a simple classification using the `shogun-data.json` file you copied to HDFS before. 
 Run the following JubaQL commands in the client:
-* `CREATE CLASSIFIER MODEL test WITH (label: "label", datum: "name") config = '{"method": "AROW","converter": { "num_filter_types": {}, "num_filter_rules": [], "string_filter_types": {}, "string_filter_rules": [], "num_types": {}, "num_rules": [],"string_types": {"unigram": { "method": "ngram", "char_num": "1" }},"string_rules": [{ "key": "*", "type": "unigram", "sample_weight": "bin", "global_weight": "bin" } ]},"parameter": {"regularization_weight" : 1.0}}'`
+* `CREATE CLASSIFIER MODEL test (label: label) AS name WITH unigram CONFIG '{"method": "AROW", "parameter": {"regularization_weight" : 1.0}}'`
 * `CREATE DATASOURCE shogun (label string, name string) FROM (STORAGE: "hdfs:///jubatus-on-yarn/sample/shogun_data.json")`
 * `UPDATE MODEL test USING train FROM shogun`
+* `START PROCESSING shogun`
 * `ANALYZE '{"name": "慶喜"}' BY MODEL test USING classify`
 * `SHUTDOWN`
-The JSON returned by the `ANALYZE` statement should indicate that the label "徳川" has the highest score.
+The JSON returned by the `ANALYZE` statement should indicate that the label "徳川" has the highest score. Note that the score may differ from the one obtained in the development setup, since multiple Jubatus instances are used for training.
 Note:
-* When the JubaQLProcessor is started using `spark-submit` as outlined above, it will first upload the `spark-assembly-1.1.1-hadoop2.4.0.jar` and `jubaql-processor-assembly-1.2.0.jar` to the cluster and add them to HDFS, from where they will be downloaded by each executor.
-* It is possible to skip the upload of the Spark libraries by copying the Spark jar file to HDFS manually and adding the parameter `-Dspark.yarn.jar=hdfs:///path/to/spark-assembly-1.1.1-hadoop2.4.0.jar` when starting the JubaQLGateway.
+
+* When the JubaQLProcessor is started, first the files `spark-assembly-1.2.2-hadoop2.4.0.jar` and `jubaql-processor-assembly-1.3.0.jar` will be uploaded to the cluster and added to HDFS, from where they will be downloaded by each executor. It is possible to skip the upload of the Spark libraries by copying the Spark jar file to HDFS manually and adding the parameter `-Dspark.yarn.jar=hdfs:///path/to/spark-assembly-1.2.2-hadoop2.4.0.jar` when starting the JubaQLGateway.
 * In theory, it is also possible to do the same for the JubaQLProcessor application jar file. However, at the moment we rely on extracting a `log4j.xml` file from that jar locally before upload, so there is no support for also storing that file in HDFS, yet.
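For illustration, the gateway start command from the cluster setup above, extended with that parameter (the HDFS path is a placeholder, as in the note), would look like:
  `java -Drun.mode=production -Djubaql.checkpointdir=$CHECKPOINT -Djubaql.zookeeper=$MY_ZOOKEEPER -Dspark.distribution="$SPARK_DIST" -Dspark.yarn.jar=hdfs:///path/to/spark-assembly-1.2.2-hadoop2.4.0.jar -Djubaql.processor.fatjar=processor/target/scala-2.10/jubaql-processor-assembly-1.3.0.jar -jar gateway/target/scala-2.10/jubaql-gateway-assembly-1.3.0.jar -i $MY_IP`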
### Run on YARN with remote gateway diff --git a/gateway/build.sbt b/gateway/build.sbt index bb46465..7a816fd 100644 --- a/gateway/build.sbt +++ b/gateway/build.sbt @@ -1,6 +1,6 @@ name := "JubaQL Gateway" -version := "1.2.0" +version := "1.3.0" // use an older version than necessary to use the same set of dependencies // across projects diff --git a/gateway/src/main/scala/us/jubat/jubaql_server/gateway/GatewayPlan.scala b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/GatewayPlan.scala index 3f41df9..7f30aec 100644 --- a/gateway/src/main/scala/us/jubat/jubaql_server/gateway/GatewayPlan.scala +++ b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/GatewayPlan.scala @@ -39,7 +39,8 @@ import java.nio.file.{StandardCopyOption, Files} @io.netty.channel.ChannelHandler.Sharable class GatewayPlan(ipAddress: String, port: Int, envpForProcessor: Array[String], runMode: RunMode, - sparkDistribution: String, fatjar: String) + sparkDistribution: String, fatjar: String, + checkpointDir: String) extends cycle.Plan /* With cycle.SynchronousExecution, there is a group of N (16?) threads (named "nioEventLoopGroup-5-*") that will process N requests in @@ -77,7 +78,7 @@ class GatewayPlan(ipAddress: String, port: Int, */ val tmpLog4jPath: String = try { val jar = new JarFile(new File(fatjar)) - val log4jFile = jar.getEntry("log4j.xml") + val log4jFile = jar.getEntry("log4j-spark-submit.xml") val log4jIs = jar.getInputStream(log4jFile) val tmpFile = File.createTempFile("log4j", ".xml") Files.copy(log4jIs, tmpFile.toPath, StandardCopyOption.REPLACE_EXISTING) @@ -88,7 +89,7 @@ class GatewayPlan(ipAddress: String, port: Int, logger.error("failed to create temporary log4j.xml copy: " + e.getMessage) throw e } - logger.debug("extracted log4j.xml file to %s".format(tmpLog4jPath)) + logger.debug("extracted log4j-spark-submit.xml file to %s".format(tmpLog4jPath)) val errorMsgContentType = ContentType("text/plain; charset=utf-8") @@ -131,7 +132,8 @@ class GatewayPlan(ipAddress: String, port: Int, // double-escaped on their way to the Spark driver and probably never end // up there. 
cmd.update(6, "spark.driver.extraJavaOptions=-Drun.mode=production " + - s"-Djubaql.zookeeper=$zookeeper") // --conf + s"-Djubaql.zookeeper=$zookeeper " + + s"-Djubaql.checkpointdir=$checkpointDir") // --conf // also specify the location of the Spark jar file, if given val sparkJarParams = sparkJar match { case Some(url) => "--conf" :: s"spark.yarn.jar=$url" :: Nil @@ -152,7 +154,7 @@ class GatewayPlan(ipAddress: String, port: Int, val isr = new InputStreamReader(is) val br = new BufferedReader(isr) var line: String = br.readLine() - while (line != null && line.trim != "yarnAppState: RUNNING") { + while (line != null && !line.trim.contains("state: RUNNING")) { if (line.contains("Exception")) { logger.error(line) throw new RuntimeException("could not start spark-submit") @@ -167,6 +169,7 @@ class GatewayPlan(ipAddress: String, port: Int, case RunMode.Development(numThreads) => cmd.update(4, s"local[$numThreads]") // --master cmd.update(6, "run.mode=development") // --conf + cmd.insertAll(7, Seq("--conf", s"jubaql.checkpointdir=$checkpointDir")) logger.debug("executing: " + cmd.mkString(" ")) Try { diff --git a/gateway/src/main/scala/us/jubat/jubaql_server/gateway/JubaQLGateway.scala b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/JubaQLGateway.scala index 62817f9..ff4be71 100644 --- a/gateway/src/main/scala/us/jubat/jubaql_server/gateway/JubaQLGateway.scala +++ b/gateway/src/main/scala/us/jubat/jubaql_server/gateway/JubaQLGateway.scala @@ -80,17 +80,13 @@ object JubaQLGateway extends LazyLogging { } logger.info("Starting in run mode %s".format(runMode)) - val sparkDistribution: String = System.getProperty("spark.distribution") - if (sparkDistribution == null || sparkDistribution.trim.isEmpty) { - System.err.println("No spark.distribution property") - System.exit(1) - } - val fatjar: String = System.getProperty("jubaql.processor.fatjar") - if (fatjar == null || fatjar.trim.isEmpty) { - System.err.println("No jubaql.processor.fatjar") - System.exit(1) - } - val plan = new GatewayPlan(ipAddress, port, envp, runMode, sparkDistribution, fatjar) + val sparkDistribution: String = getPropertyOrExitIfEmpty("spark.distribution") + val fatjar: String = getPropertyOrExitIfEmpty("jubaql.processor.fatjar") + val checkpointDir = getCheckpointDir(runMode) + val plan = new GatewayPlan(ipAddress, port, envp, runMode, + sparkDistribution = sparkDistribution, + fatjar = fatjar, + checkpointDir = checkpointDir) val nettyServer = unfiltered.netty.Server.http(port).plan(plan) logger.info("JubaQLGateway starting") nettyServer.run() @@ -114,6 +110,29 @@ object JubaQLGateway extends LazyLogging { parser.parse(args, CommandlineOptions()) } + + private def getPropertyOrExitIfEmpty(name: String): String = { + val prop = scala.util.Properties.propOrElse(name, "") + if (prop.trim.isEmpty) { + System.err.println(s"No ${name} property") + System.exit(1) + } + prop + } + + private def getCheckpointDir(runMode: RunMode): String = { + val dir = scala.util.Properties.propOrElse("jubaql.checkpointdir", "") + if (dir.trim.isEmpty) { + runMode match { + case RunMode.Production(_, _, _, _) => + "hdfs:///tmp/spark" + case RunMode.Development(_) => + "file:///tmp/spark" + } + } else { + dir + } + } } case class CommandlineOptions(ip: String = "", port: Int = JubaQLGateway.defaultPort) diff --git a/gateway/src/test/resources/processor-logfile.jar b/gateway/src/test/resources/processor-logfile.jar index 050da0bd9889ddb0f0d712c5ad0e75f4b86066fa..4ec47efbae310ab76e7275f7f16a7ff3232bdc91 100644 GIT binary patch delta 68 
zcmcb?`iXUdl$aQU3`0(Sx=EI9aY15Hwr+80Qf_9+M&%qPMzP5gn8bKd#3wIf(%_0= LU|?uq0ul@W>6{a> delta 59 zcmeywdV_U>6elNx3`0(Sx=GeXgB&JtP6i1E5MTu2EsY=+SZ?wGCJiosAg`DSBnAMA CISX|F
diff --git a/gateway/src/test/scala/us/jubat/jubaql_server/gateway/GatewayServer.scala b/gateway/src/test/scala/us/jubat/jubaql_server/gateway/GatewayServer.scala index 7743057..d1d531a 100644 --- a/gateway/src/test/scala/us/jubat/jubaql_server/gateway/GatewayServer.scala +++ b/gateway/src/test/scala/us/jubat/jubaql_server/gateway/GatewayServer.scala @@ -22,7 +22,9 @@ trait GatewayServer extends BeforeAndAfterAll { protected val plan = new GatewayPlan("example.com", 1234, Array(), RunMode.Test,
- "", "src/test/resources/processor-logfile.jar")
+ sparkDistribution = "",
+ fatjar = "src/test/resources/processor-logfile.jar",
+ checkpointDir = "file:///tmp/spark")
 protected val server = unfiltered.netty.Server.http(9877).plan(plan) override protected def beforeAll() = {
diff --git a/increase-version.sh b/increase-version.sh new file mode 100755 index 0000000..b96d53a --- /dev/null +++ b/increase-version.sh @@ -0,0 +1,22 @@
+#!/bin/bash
+
+if [ $# -ne 1 ]; then
+  echo "Usage: increase-version.sh toversion"
+  exit 1
+fi
+
+OLDVERSION=$(grep "version := " processor/build.sbt | sed 's/[^"]*"\([^"]*\).*/\1/')
+NEWVERSION=$1
+
+echo "Bumping version from $OLDVERSION to $NEWVERSION ..."
+
+sed -i "s/$OLDVERSION/$NEWVERSION/g" */build.sbt
+
+sed -i "s/$OLDVERSION/$NEWVERSION/g" README.md
+
+echo "Checking for old occurrences of $OLDVERSION ..."
+
+grep -F -R "$OLDVERSION" */src
+grep -F --directories=skip "$OLDVERSION" */*
+grep -F --directories=skip "$OLDVERSION" *
+
diff --git a/processor/NOTICE_SPARK b/processor/NOTICE_SPARK new file mode 100644 index 0000000..452aef2 --- /dev/null +++ b/processor/NOTICE_SPARK @@ -0,0 +1,574 @@
+Apache Spark
+Copyright 2014 The Apache Software Foundation.
+
+This product includes software developed at
+The Apache Software Foundation (http://www.apache.org/).
+
+
+========================================================================
+Common Development and Distribution License 1.0
+========================================================================
+
+The following components are provided under the Common Development and Distribution License 1.0. See project link for details.
+
+ (CDDL 1.0) Glassfish Jasper (org.mortbay.jetty:jsp-2.1:6.1.14 - http://jetty.mortbay.org/project/modules/jsp-2.1)
+ (CDDL 1.0) Servlet Specification 2.5 API (org.mortbay.jetty:servlet-api-2.5:6.1.14 - http://jetty.mortbay.org/project/modules/servlet-api-2.5)
+ (COMMON DEVELOPMENT AND DISTRIBUTION LICENSE (CDDL) Version 1.0) (GNU General Public Library) Streaming API for XML (javax.xml.stream:stax-api:1.0-2 - no url defined)
+ (Common Development and Distribution License (CDDL) v1.0) JavaBeans Activation Framework (JAF) (javax.activation:activation:1.1 - http://java.sun.com/products/javabeans/jaf/index.jsp)
+
+========================================================================
+Common Development and Distribution License 1.1
+========================================================================
+
+The following components are provided under the Common Development and Distribution License 1.1. See project link for details.
+ + (CDDL 1.1) (GPL2 w/ CPE) JAXB API bundle for GlassFish V3 (javax.xml.bind:jaxb-api:2.2.2 - https://jaxb.dev.java.net/) + (CDDL 1.1) (GPL2 w/ CPE) JAXB RI (com.sun.xml.bind:jaxb-impl:2.2.3-1 - http://jaxb.java.net/) + (CDDL 1.1) (GPL2 w/ CPE) jersey-core (com.sun.jersey:jersey-core:1.8 - https://jersey.dev.java.net/jersey-core/) + (CDDL 1.1) (GPL2 w/ CPE) jersey-core (com.sun.jersey:jersey-core:1.9 - https://jersey.java.net/jersey-core/) + (CDDL 1.1) (GPL2 w/ CPE) jersey-guice (com.sun.jersey.contribs:jersey-guice:1.9 - https://jersey.java.net/jersey-contribs/jersey-guice/) + (CDDL 1.1) (GPL2 w/ CPE) jersey-json (com.sun.jersey:jersey-json:1.8 - https://jersey.dev.java.net/jersey-json/) + (CDDL 1.1) (GPL2 w/ CPE) jersey-json (com.sun.jersey:jersey-json:1.9 - https://jersey.java.net/jersey-json/) + (CDDL 1.1) (GPL2 w/ CPE) jersey-server (com.sun.jersey:jersey-server:1.8 - https://jersey.dev.java.net/jersey-server/) + (CDDL 1.1) (GPL2 w/ CPE) jersey-server (com.sun.jersey:jersey-server:1.9 - https://jersey.java.net/jersey-server/) + +======================================================================== +Common Public License 1.0 +======================================================================== + +The following components are provided under the Common Public 1.0 License. See project link for details. + + (Common Public License Version 1.0) JUnit (junit:junit-dep:4.10 - http://junit.org) + (Common Public License Version 1.0) JUnit (junit:junit:3.8.1 - http://junit.org) + (Common Public License Version 1.0) JUnit (junit:junit:4.8.2 - http://junit.org) + +======================================================================== +Eclipse Public License 1.0 +======================================================================== + +The following components are provided under the Eclipse Public License 1.0. See project link for details. + + (Eclipse Public License - Version 1.0) mqtt-client (org.eclipse.paho:mqtt-client:0.4.0 - http://www.eclipse.org/paho/mqtt-client) + (Eclipse Public License v1.0) Eclipse JDT Core (org.eclipse.jdt:core:3.1.1 - http://www.eclipse.org/jdt/) + +======================================================================== +Mozilla Public License 1.0 +======================================================================== + +The following components are provided under the Mozilla Public License 1.0. See project link for details. + + (GPL) (LGPL) (MPL) JTransforms (com.github.rwl:jtransforms:2.4.0 - http://sourceforge.net/projects/jtransforms/) + (Mozilla Public License Version 1.1) jamon-runtime (org.jamon:jamon-runtime:2.3.1 - http://www.jamon.org/jamon-runtime/) + + + +======================================================================== +NOTICE files +======================================================================== + +The following NOTICEs are pertain to software distributed with this project. + + +// ------------------------------------------------------------------ +// NOTICE file corresponding to the section 4d of The Apache License, +// Version 2.0, in this case for +// ------------------------------------------------------------------ + +Apache Avro +Copyright 2009-2013 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Apache Commons Codec +Copyright 2002-2009 The Apache Software Foundation + +This product includes software developed by +The Apache Software Foundation (http://www.apache.org/). 
+ +-------------------------------------------------------------------------------- +src/test/org/apache/commons/codec/language/DoubleMetaphoneTest.java contains +test data from http://aspell.sourceforge.net/test/batch0.tab. + +Copyright (C) 2002 Kevin Atkinson (kevina@gnu.org). Verbatim copying +and distribution of this entire article is permitted in any medium, +provided this notice is preserved. +-------------------------------------------------------------------------------- + +Apache HttpComponents HttpClient +Copyright 1999-2011 The Apache Software Foundation + +This project contains annotations derived from JCIP-ANNOTATIONS +Copyright (c) 2005 Brian Goetz and Tim Peierls. See http://www.jcip.net + +Apache HttpComponents HttpCore +Copyright 2005-2011 The Apache Software Foundation + +Curator Recipes +Copyright 2011-2014 The Apache Software Foundation + +Curator Framework +Copyright 2011-2014 The Apache Software Foundation + +Curator Client +Copyright 2011-2014 The Apache Software Foundation + +Apache Geronimo +Copyright 2003-2008 The Apache Software Foundation + +Activation 1.1 +Copyright 2003-2007 The Apache Software Foundation + +Apache Commons Lang +Copyright 2001-2014 The Apache Software Foundation + +This product includes software from the Spring Framework, +under the Apache License 2.0 (see: StringUtils.containsWhitespace()) + +Apache log4j +Copyright 2007 The Apache Software Foundation + +# Compress LZF + +This library contains efficient implementation of LZF compression format, +as well as additional helper classes that build on JDK-provided gzip (deflat) +codec. + +## Licensing + +Library is licensed under Apache License 2.0, as per accompanying LICENSE file. + +## Credit + +Library has been written by Tatu Saloranta (tatu.saloranta@iki.fi). +It was started at Ning, inc., as an official Open Source process used by +platform backend, but after initial versions has been developed outside of +Ning by supporting community. + +Other contributors include: + +* Jon Hartlaub (first versions of streaming reader/writer; unit tests) +* Cedrik Lime: parallel LZF implementation + +Various community members have contributed bug reports, and suggested minor +fixes; these can be found from file "VERSION.txt" in SCM. + +Objenesis +Copyright 2006-2009 Joe Walnes, Henri Tremblay, Leonardo Mesquita + +Apache Commons Net +Copyright 2001-2010 The Apache Software Foundation + + The Netty Project + ================= + +Please visit the Netty web site for more information: + + * http://netty.io/ + +Copyright 2011 The Netty Project + +The Netty Project licenses this file to you under the Apache License, +version 2.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at: + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +License for the specific language governing permissions and limitations +under the License. + +Also, please refer to each LICENSE..txt file, which is located in +the 'license' directory of the distribution file, for the license terms of the +components that this product depends on. + +------------------------------------------------------------------------------- +This product contains the extensions to Java Collections Framework which has +been derived from the works by JSR-166 EG, Doug Lea, and Jason T. 
Greene: + + * LICENSE: + * license/LICENSE.jsr166y.txt (Public Domain) + * HOMEPAGE: + * http://gee.cs.oswego.edu/cgi-bin/viewcvs.cgi/jsr166/ + * http://viewvc.jboss.org/cgi-bin/viewvc.cgi/jbosscache/experimental/jsr166/ + +This product contains a modified version of Robert Harder's Public Domain +Base64 Encoder and Decoder, which can be obtained at: + + * LICENSE: + * license/LICENSE.base64.txt (Public Domain) + * HOMEPAGE: + * http://iharder.sourceforge.net/current/java/base64/ + +This product contains a modified version of 'JZlib', a re-implementation of +zlib in pure Java, which can be obtained at: + + * LICENSE: + * license/LICENSE.jzlib.txt (BSD Style License) + * HOMEPAGE: + * http://www.jcraft.com/jzlib/ + +This product optionally depends on 'Protocol Buffers', Google's data +interchange format, which can be obtained at: + + * LICENSE: + * license/LICENSE.protobuf.txt (New BSD License) + * HOMEPAGE: + * http://code.google.com/p/protobuf/ + +This product optionally depends on 'SLF4J', a simple logging facade for Java, +which can be obtained at: + + * LICENSE: + * license/LICENSE.slf4j.txt (MIT License) + * HOMEPAGE: + * http://www.slf4j.org/ + +This product optionally depends on 'Apache Commons Logging', a logging +framework, which can be obtained at: + + * LICENSE: + * license/LICENSE.commons-logging.txt (Apache License 2.0) + * HOMEPAGE: + * http://commons.apache.org/logging/ + +This product optionally depends on 'Apache Log4J', a logging framework, +which can be obtained at: + + * LICENSE: + * license/LICENSE.log4j.txt (Apache License 2.0) + * HOMEPAGE: + * http://logging.apache.org/log4j/ + +This product optionally depends on 'JBoss Logging', a logging framework, +which can be obtained at: + + * LICENSE: + * license/LICENSE.jboss-logging.txt (GNU LGPL 2.1) + * HOMEPAGE: + * http://anonsvn.jboss.org/repos/common/common-logging-spi/ + +This product optionally depends on 'Apache Felix', an open source OSGi +framework implementation, which can be obtained at: + + * LICENSE: + * license/LICENSE.felix.txt (Apache License 2.0) + * HOMEPAGE: + * http://felix.apache.org/ + +This product optionally depends on 'Webbit', a Java event based +WebSocket and HTTP server: + + * LICENSE: + * license/LICENSE.webbit.txt (BSD License) + * HOMEPAGE: + * https://github.com/joewalnes/webbit + +# Jackson JSON processor + +Jackson is a high-performance, Free/Open Source JSON processing library. +It was originally written by Tatu Saloranta (tatu.saloranta@iki.fi), and has +been in development since 2007. +It is currently developed by a community of developers, as well as supported +commercially by FasterXML.com. + +Jackson core and extension components may be licensed under different licenses. +To find the details that apply to this artifact see the accompanying LICENSE file. +For more information, including possible other licensing options, contact +FasterXML.com (http://fasterxml.com). + +## Credits + +A list of contributors may be found from CREDITS file, which is included +in some artifacts (usually source distributions); but is always available +from the source code management (SCM) system project uses. + +Jackson core and extension components may licensed under different licenses. +To find the details that apply to this artifact see the accompanying LICENSE file. +For more information, including possible other licensing options, contact +FasterXML.com (http://fasterxml.com). + +mesos +Copyright 2014 The Apache Software Foundation + +Apache Thrift +Copyright 2006-2010 The Apache Software Foundation. 
+ + Apache Ant + Copyright 1999-2013 The Apache Software Foundation + + The task is based on code Copyright (c) 2002, Landmark + Graphics Corp that has been kindly donated to the Apache Software + Foundation. + +Apache Commons IO +Copyright 2002-2012 The Apache Software Foundation + +Apache Commons Math +Copyright 2001-2013 The Apache Software Foundation + +=============================================================================== + +The inverse error function implementation in the Erf class is based on CUDA +code developed by Mike Giles, Oxford-Man Institute of Quantitative Finance, +and published in GPU Computing Gems, volume 2, 2010. +=============================================================================== + +The BracketFinder (package org.apache.commons.math3.optimization.univariate) +and PowellOptimizer (package org.apache.commons.math3.optimization.general) +classes are based on the Python code in module "optimize.py" (version 0.5) +developed by Travis E. Oliphant for the SciPy library (http://www.scipy.org/) +Copyright © 2003-2009 SciPy Developers. +=============================================================================== + +The LinearConstraint, LinearObjectiveFunction, LinearOptimizer, +RelationShip, SimplexSolver and SimplexTableau classes in package +org.apache.commons.math3.optimization.linear include software developed by +Benjamin McCann (http://www.benmccann.com) and distributed with +the following copyright: Copyright 2009 Google Inc. +=============================================================================== + +This product includes software developed by the +University of Chicago, as Operator of Argonne National +Laboratory. +The LevenbergMarquardtOptimizer class in package +org.apache.commons.math3.optimization.general includes software +translated from the lmder, lmpar and qrsolv Fortran routines +from the Minpack package +Minpack Copyright Notice (1999) University of Chicago. All rights reserved +=============================================================================== + +The GraggBulirschStoerIntegrator class in package +org.apache.commons.math3.ode.nonstiff includes software translated +from the odex Fortran routine developed by E. Hairer and G. Wanner. +Original source copyright: +Copyright (c) 2004, Ernst Hairer +=============================================================================== + +The EigenDecompositionImpl class in package +org.apache.commons.math3.linear includes software translated +from some LAPACK Fortran routines. Original source copyright: +Copyright (c) 1992-2008 The University of Tennessee. All rights reserved. +=============================================================================== + +The MersenneTwister class in package org.apache.commons.math3.random +includes software translated from the 2002-01-26 version of +the Mersenne-Twister generator written in C by Makoto Matsumoto and Takuji +Nishimura. Original source copyright: +Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura, +All rights reserved +=============================================================================== + +The LocalizedFormatsTest class in the unit tests is an adapted version of +the OrekitMessagesTest class from the orekit library distributed under the +terms of the Apache 2 licence. 
Original source copyright: +Copyright 2010 CS Systèmes d'Information +=============================================================================== + +The HermiteInterpolator class and its corresponding test have been imported from +the orekit library distributed under the terms of the Apache 2 licence. Original +source copyright: +Copyright 2010-2012 CS Systèmes d'Information +=============================================================================== + +The creation of the package "o.a.c.m.analysis.integration.gauss" was inspired +by an original code donated by Sébastien Brisard. +=============================================================================== + +The complete text of licenses and disclaimers associated with the the original +sources enumerated above at the time of code translation are in the LICENSE.txt +file. + +This product currently only contains code developed by authors +of specific components, as identified by the source code files; +if such notes are missing files have been created by +Tatu Saloranta. + +For additional credits (generally to people who reported problems) +see CREDITS file. + +Apache Commons Lang +Copyright 2001-2011 The Apache Software Foundation + +Apache Commons Compress +Copyright 2002-2012 The Apache Software Foundation + +Apache Commons CLI +Copyright 2001-2009 The Apache Software Foundation + +Google Guice - Extensions - Servlet +Copyright 2006-2011 Google, Inc. + +Google Guice - Core Library +Copyright 2006-2011 Google, Inc. + +Apache Jakarta HttpClient +Copyright 1999-2007 The Apache Software Foundation + +Apache Hive +Copyright 2008-2013 The Apache Software Foundation + +This product includes software developed by The Apache Software +Foundation (http://www.apache.org/). + +This product includes software developed by The JDBM Project +(http://jdbm.sourceforge.net/). + +This product includes/uses ANTLR (http://www.antlr.org/), +Copyright (c) 2003-2011, Terrence Parr. + +This product includes/uses StringTemplate (http://www.stringtemplate.org/), +Copyright (c) 2011, Terrence Parr. + +This product includes/uses ASM (http://asm.ow2.org/), +Copyright (c) 2000-2007 INRIA, France Telecom. + +This product includes/uses org.json (http://www.json.org/java/index.html), +Copyright (c) 2002 JSON.org + +This product includes/uses JLine (http://jline.sourceforge.net/), +Copyright (c) 2002-2006, Marc Prud'hommeaux . + +This product includes/uses SQLLine (http://sqlline.sourceforge.net), +Copyright (c) 2002, 2003, 2004, 2005 Marc Prud'hommeaux . + +This product includes/uses SLF4J (http://www.slf4j.org/), +Copyright (c) 2004-2010 QOS.ch + +This product includes/uses Bootstrap (http://twitter.github.com/bootstrap/), +Copyright (c) 2012 Twitter, Inc. + +This product includes/uses Glyphicons (http://glyphicons.com/), +Copyright (c) 2010 - 2012 Jan Kovarík + +This product includes DataNucleus (http://www.datanucleus.org/) +Copyright 2008-2008 DataNucleus + +This product includes Guava (http://code.google.com/p/guava-libraries/) +Copyright (C) 2006 Google Inc. + +This product includes JavaEWAH (http://code.google.com/p/javaewah/) +Copyright (C) 2011 Google Inc. + +Apache Commons Pool +Copyright 1999-2009 The Apache Software Foundation + +========================================================================= +== NOTICE file corresponding to section 4(d) of the Apache License, == +== Version 2.0, in this case for the DataNucleus distribution. 
== +========================================================================= + +=================================================================== +This product includes software developed by many individuals, +including the following: +=================================================================== +Erik Bengtson +Andy Jefferson + +=================================================================== +This product has included contributions from some individuals, +including the following: +=================================================================== + +=================================================================== +This product has included contributions from some individuals, +including the following: +=================================================================== +Joerg von Frantzius +Thomas Marti +Barry Haddow +Marco Schulze +Ralph Ullrich +David Ezzio +Brendan de Beer +David Eaves +Martin Taal +Tony Lai +Roland Szabo +Marcus Mennemeier +Xuan Baldauf +Eric Sultan + +=================================================================== +This product also includes software developed by the TJDO project +(http://tjdo.sourceforge.net/). +=================================================================== + +=================================================================== +This product includes software developed by many individuals, +including the following: +=================================================================== +Andy Jefferson +Erik Bengtson +Joerg von Frantzius +Marco Schulze + +=================================================================== +This product has included contributions from some individuals, +including the following: +=================================================================== +Barry Haddow +Ralph Ullrich +David Ezzio +Brendan de Beer +David Eaves +Martin Taal +Tony Lai +Roland Szabo +Anton Troshin (Timesten) + +=================================================================== +This product also includes software developed by the Apache Commons project +(http://commons.apache.org/). +=================================================================== + +Apache Java Data Objects (JDO) +Copyright 2005-2006 The Apache Software Foundation + +========================================================================= +== NOTICE file corresponding to section 4(d) of the Apache License, == +== Version 2.0, in this case for the Apache Derby distribution. == +========================================================================= + +Apache Derby +Copyright 2004-2008 The Apache Software Foundation + +Portions of Derby were originally developed by +International Business Machines Corporation and are +licensed to the Apache Software Foundation under the +"Software Grant and Corporate Contribution License Agreement", +informally known as the "Derby CLA". +The following copyright notice(s) were affixed to portions of the code +with which this file is now or was at one time distributed +and are placed here unaltered. + +(C) Copyright 1997,2004 International Business Machines Corporation. All rights reserved. + +(C) Copyright IBM Corp. 2003. 
+ +The portion of the functionTests under 'nist' was originally +developed by the National Institute of Standards and Technology (NIST), +an agency of the United States Department of Commerce, and adapted by +International Business Machines Corporation in accordance with the NIST +Software Acknowledgment and Redistribution document at +http://www.itl.nist.gov/div897/ctg/sql_form.htm + +Apache Commons Collections +Copyright 2001-2008 The Apache Software Foundation + +Apache Commons Configuration +Copyright 2001-2008 The Apache Software Foundation + +Apache Jakarta Commons Digester +Copyright 2001-2006 The Apache Software Foundation + +Apache Commons BeanUtils +Copyright 2000-2008 The Apache Software Foundation + +Apache Avro Mapred API +Copyright 2009-2013 The Apache Software Foundation + +Apache Avro IPC +Copyright 2009-2013 The Apache Software Foundation diff --git a/processor/assembly.sbt b/processor/assembly.sbt index 5e3272c..6f8f078 100644 --- a/processor/assembly.sbt +++ b/processor/assembly.sbt @@ -1,16 +1,11 @@ -import AssemblyKeys._ - -assemblySettings - test in assembly := {} jarName in assembly := "jubaql-processor-assembly-" + version.value + ".jar" -/// We MUST include Scala libraries, otherwise scalalogging won't -/// be included: -// assemblyOption in assembly ~= { -// _.copy(includeScala = false) -// } +// Scala libraries will be provided by the runtime. +assemblyOption in assembly ~= { + _.copy(includeScala = false) +} mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) => { @@ -36,6 +31,9 @@ mergeStrategy in assembly <<= (mergeStrategy in assembly) { // commons-beanutils-core-1.8.0.jar:org/apache/commons/beanutils/BasicDynaBean.class // and others case PathList("org", "apache", xs @ _*) => MergeStrategy.last + // scala-logging-slf4j_2.10-2.1.2.jar:com/typesafe/scalalogging/slf4j/Logger$.class vs. + // scalalogging-slf4j_2.10-1.1.0.jar:com/typesafe/scalalogging/slf4j/Logger$.class + case PathList("com", "typesafe", "scalalogging", xs @ _*) => MergeStrategy.last // javax.transaction-1.1.1.v201105210645.jar:plugin.properties vs. // javax.servlet-3.0.0.v201112011016.jar:plugin.properties vs. // javax.mail.glassfish-1.4.1.v201005082020.jar:plugin.properties vs. @@ -50,12 +48,6 @@ mergeStrategy in assembly <<= (mergeStrategy in assembly) { } } -// take only the Spark and Hadoop jars out (this is more or less an -// alternative to marking Spark as "provided") -excludedJars in assembly <<= (fullClasspath in assembly) map { cp => - cp filter {item => item.data.getPath.contains("/org.apache.hadoop/")} -} - // add "provided" dependencies back to classpath when using "sbt run". 
// this does not affect the "run" function in IDEA (i.e., it can't be used) run in Compile <<= Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)) diff --git a/processor/build.sbt b/processor/build.sbt index 5bd2741..ce48bcc 100644 --- a/processor/build.sbt +++ b/processor/build.sbt @@ -1,9 +1,11 @@ import com.typesafe.sbt.SbtStartScript import java.io.File +import org.apache.ivy.core.module.descriptor.ExcludeRule + name := "JubaQL Processor" -version := "1.2.0" +version := "1.3.0" // use 2.10 for now (Spark has no 2.11 support yet) scalaVersion := "2.10.4" @@ -20,16 +22,19 @@ resolvers += "Cloudera Repository" at "https://repository.cloudera.com/artifacto // Add msgpack repository (sbt does not use the information provided in the Jubatus POM) resolvers += "MessagePack" at "http://msgpack.org/maven2" +// local repository +resolvers += Resolver.file("LocalRepo", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns) + libraryDependencies ++= Seq( // logging "com.typesafe.scala-logging" %% "scala-logging-slf4j" % "2.1.2", - "org.slf4j" % "slf4j-api" % "1.7.7", - "org.slf4j" % "slf4j-log4j12" % "1.7.7", + "org.slf4j" % "slf4j-api" % "1.6.4", + "org.slf4j" % "slf4j-log4j12" % "1.6.4", // Jubatus - "us.jubat" % "jubatus" % "0.6.0" + "us.jubat" % "jubatus" % "0.7.1" exclude("org.jboss.netty", "netty"), // jubatusonyarn - "us.jubat" %% "jubatus-on-yarn-client" % "1.0" + "us.jubat" %% "jubatus-on-yarn-client" % "1.1" exclude("javax.servlet", "servlet-api") exclude("org.jboss.netty", "netty"), // HTTP server @@ -39,25 +44,24 @@ libraryDependencies ++= Seq( // parsing of program arguments "com.github.scopt" %% "scopt" % "3.2.0", // Spark - "org.apache.spark" %% "spark-core" % "1.1.1" % "provided", - // the following will prevent org.spark-project.akka:akka-remote_2.10:2.2.3-shaded-protobuf - // from pulling in io.netty:netty:3.6.6.Final, but it will not prevent spark-core - // itself to pull in io.netty:netty-all:4.0.23.Final (note that the former - // includes the package "org.jboss.netty", while the latter includes "io.netty".) 
- "org.spark-project.akka" %% "akka-remote" % "2.2.3-shaded-protobuf" - exclude("io.netty", "netty"), - "org.apache.spark" %% "spark-streaming" % "1.1.1" % "provided", - "org.apache.spark" %% "spark-streaming-kafka" % "1.1.1" + "org.apache.spark" %% "spark-core" % "1.2.2" % "provided" + excludeAll(ExclusionRule(organization = "org.slf4j")), + "org.apache.spark" %% "spark-streaming" % "1.2.2" % "provided", + "org.apache.spark" %% "spark-streaming-kafka" % "1.2.2" + exclude("org.apache.spark", "spark-streaming_2.10") exclude("commons-beanutils", "commons-beanutils") exclude("commons-collections", "commons-collections") exclude("com.esotericsoftware.minlog", "minlog"), - "org.apache.spark" %% "spark-sql" % "1.1.1", + "org.apache.spark" %% "spark-sql" % "1.2.2" + exclude("org.apache.spark", "spark-core_2.10"), // registration with the gateway "net.databinder.dispatch" %% "dispatch-core" % "0.11.2", - // HDFS - "org.apache.hadoop" % "hadoop-client" % "2.5.0-cdh5.2.0" % "provided", + // math + "org.apache.commons" % "commons-math3" % "3.5", // for testing "org.scalatest" %% "scalatest" % "2.2.1" % "test", + "org.scalacheck" %% "scalacheck" % "1.12.1" % "test", + "org.subethamail" % "subethasmtp" % "3.1.7" % "test", "net.databinder" %% "unfiltered-filter" % "0.8.2" % "test", "net.databinder" %% "unfiltered-json4s" % "0.8.2" % "test", "net.databinder" %% "unfiltered-netty-server" % "0.8.2" % "test" diff --git a/processor/project/assembly.sbt b/processor/project/assembly.sbt index 54c3252..74adde3 100644 --- a/processor/project/assembly.sbt +++ b/processor/project/assembly.sbt @@ -1 +1 @@ -addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2") +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.13.0") diff --git a/processor/project/build.properties b/processor/project/build.properties new file mode 100644 index 0000000..a6e117b --- /dev/null +++ b/processor/project/build.properties @@ -0,0 +1 @@ +sbt.version=0.13.8 diff --git a/processor/project/deptree.sbt b/processor/project/deptree.sbt index 3c9aed7..10dfae8 100644 --- a/processor/project/deptree.sbt +++ b/processor/project/deptree.sbt @@ -1 +1 @@ -addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4") +addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.5") diff --git a/processor/src/main/resources/log4j-spark-submit.xml b/processor/src/main/resources/log4j-spark-submit.xml new file mode 100644 index 0000000..299a35f --- /dev/null +++ b/processor/src/main/resources/log4j-spark-submit.xml @@ -0,0 +1,70 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/processor/src/main/resources/log4j.xml b/processor/src/main/resources/log4j.xml index ee27083..4aaa374 100644 --- a/processor/src/main/resources/log4j.xml +++ b/processor/src/main/resources/log4j.xml @@ -4,8 +4,8 @@ - - + + @@ -51,6 +51,16 @@ + + + + + + + + + + diff --git a/processor/src/main/scala/org/apache/spark/streaming/dstream/OrderedFileInputDStream.scala b/processor/src/main/scala/org/apache/spark/streaming/dstream/OrderedFileInputDStream.scala index 9f13814..71231d3 100644 --- a/processor/src/main/scala/org/apache/spark/streaming/dstream/OrderedFileInputDStream.scala +++ b/processor/src/main/scala/org/apache/spark/streaming/dstream/OrderedFileInputDStream.scala @@ -13,27 +13,24 @@ // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, 
Fifth Floor, Boston, MA 02110-1301 USA -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * --- - * - * Based on FileInputDStream from the Apache Spark 1.1.0 distribution. - */ - +// +// This file is based on streaming/src/main/scala/org/apache/spark/streaming/dstream/FileInputDStream.scala +// from Apache Spark 1.1.0 and incorporates code covered by the following terms: +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE_SPARK file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. package org.apache.spark.streaming.dstream import java.io.{ObjectInputStream, IOException} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/AggregateFunctions.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/AggregateFunctions.scala new file mode 100644 index 0000000..836093d --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/AggregateFunctions.scala @@ -0,0 +1,281 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import org.apache.spark.SparkContext._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.types._ + +import scala.reflect.ClassTag + +sealed trait SomeAggregateFunction[IN] { + def aggFun(rdd: RDD[(Long, (Long, IN))]): RDD[(Long, Any)] + + val inType: DataType + val outType: DataType + + implicit protected def toLongAnyRDD[T: ClassTag](rdd: RDD[(Long, T)]) = + rdd.mapValues(_.asInstanceOf[Any]) +} + +trait DoubleInputAggFun extends SomeAggregateFunction[Double] { + override val inType: DataType = DoubleType +} + +object AvgFun extends DoubleInputAggFun { + override def aggFun(rdd: RDD[(Long, (Long, Double))]) = + us.jubat.jubaql_server.processor.udf.AvgFun.apply(rdd) + + override val outType: DataType = DoubleType +} + +object StdDevFun extends DoubleInputAggFun { + override def aggFun(rdd: RDD[(Long, (Long, Double))]) = + us.jubat.jubaql_server.processor.udf.StdDevFun.apply(rdd) + + override val outType: DataType = DoubleType +} + +class QuantileFun(position: Double = 0.5d) + extends us.jubat.jubaql_server.processor.udf.QuantileFun(position) + with DoubleInputAggFun { + override def aggFun(rdd: RDD[(Long, (Long, Double))]) = + apply(rdd) + + override val outType: DataType = DoubleType +} + +object LinApproxFun extends DoubleInputAggFun { + override def aggFun(rdd: RDD[(Long, (Long, Double))]) = + us.jubat.jubaql_server.processor.udf.LinApproxFun.apply(rdd).mapValues(ab => { + Map("a" -> ab._1, "b" -> ab._2).asInstanceOf[Any] + }) + + override val outType: DataType = + StructType(StructField("a", DoubleType, nullable = true) :: + StructField("b", DoubleType, nullable = true) :: Nil) +} + +object FourierCoeffsFun extends DoubleInputAggFun { + override def aggFun(rdd: RDD[(Long, (Long, Double))]) = + us.jubat.jubaql_server.processor.udf.FourierCoeffsFun.apply(rdd).mapValues(reIm => { + Map("re" -> reIm._1, "im" -> reIm._2).asInstanceOf[Any] + }) + + override val outType: DataType = + StructType(StructField("re", ArrayType(DoubleType, containsNull = false), nullable = true) :: + StructField("im", ArrayType(DoubleType, containsNull = false), nullable = true) :: Nil) +} + +object WaveletCoeffsFun extends DoubleInputAggFun { + override def aggFun(rdd: RDD[(Long, (Long, Double))]) = + us.jubat.jubaql_server.processor.udf.WaveletCoeffsFun.apply(rdd) + + override val outType: DataType = ArrayType(DoubleType, containsNull = false) +} + +class HistogramFun(lowestUpperBound: Double = 0.1, + highestLowerBound: Double = 0.9, + numBins: Int = 10) + extends us.jubat.jubaql_server.processor.udf.HistogramFun(lowestUpperBound, highestLowerBound, numBins) + with DoubleInputAggFun { + override def aggFun(rdd: RDD[(Long, (Long, Double))]) = + apply(rdd) + + override val outType: DataType = ArrayType(DoubleType, containsNull = false) +} + +trait StringInputAggFun extends SomeAggregateFunction[String] { + override val inType: DataType = StringType +} + +class ConcatFun(separator: String = " ") + extends us.jubat.jubaql_server.processor.udf.ConcatFun(separator) + with StringInputAggFun { + override def aggFun(rdd: RDD[(Long, (Long, String))]) = + apply(rdd) + + override val outType: DataType = StringType +} + +object MaxElemFun extends StringInputAggFun { + override 
def aggFun(rdd: RDD[(Long, (Long, String))]) = + // for some reason in (only) this case, type parameters + // to apply() must be specified explicitly + us.jubat.jubaql_server.processor.udf.MaxElemFun.apply[Long, Long, String](rdd) + + override val outType: DataType = StringType +} + +object AggregateFunctions { + type AggFunOrError = Either[String, SomeAggregateFunction[_]] + + // check parameter types for aggregate functions + + def checkAvgParams(params: List[Expression]): AggFunOrError = { + SingleParamAggFunctionChecker("avg", params, AvgFun).check + } + + def checkStdDevParams(params: List[Expression]): AggFunOrError = { + SingleParamAggFunctionChecker("stddev", params, StdDevFun).check + } + + def checkQuantileParams(params: List[Expression]): AggFunOrError = { + params match { + // version with provided p parameter + case pExp :: exp :: Nil => + if (!pExp.foldable) { + Left("first parameter to quantile must be evaluable") + } else { + if (!pExp.dataType.isInstanceOf[NumericType]) { + Left(s"wrong type of parameters for quantile (must be (numeric, numeric))") + } else { + val pBox = pExp.eval() + pBox match { + case p: Double if p >= 0 && p <= 1.0 => + Right(new QuantileFun(p)) + case _ => + Left("first parameter to quantile must be in [0,1] range") + } + } + } + + // no parameter version + case others => + SingleParamAggFunctionChecker("quantile", others, new QuantileFun()).check + } + } + + def checkLinApproxParams(params: List[Expression]): AggFunOrError = { + SingleParamAggFunctionChecker("linapprox", params, LinApproxFun).check + } + + def checkFourierParams(params: List[Expression]): AggFunOrError = { + SingleParamAggFunctionChecker("fourier", params, FourierCoeffsFun).check + } + + def checkWaveletParams(params: List[Expression]): AggFunOrError = { + SingleParamAggFunctionChecker("wavelet", params, WaveletCoeffsFun).check + } + + def checkHistogramParams(params: List[Expression]): AggFunOrError = { + params match { + // version with 3 provided parameters (bounds and number of bins) + case lubExp :: hlbExp :: binExp :: exp :: Nil => + if (!lubExp.foldable || !hlbExp.foldable || !binExp.foldable) { + Left("parameters for histogram must be evaluable") + } else { + if (!lubExp.dataType.isInstanceOf[NumericType] || + !hlbExp.dataType.isInstanceOf[NumericType] || + !binExp.dataType.isInstanceOf[IntegralType]) { + Left(s"wrong type of parameters for histogram (must be (numeric, numeric, integer, numeric))") + } else { + (lubExp.eval(), hlbExp.eval(), binExp.eval()) match { + case (lub: Double, hlb: Double, bin: Int) => + Right(new HistogramFun(lub, hlb, bin)) + case _ => + Left("wrong type of parameters for histogram, must be (double, double, int, numeric)") + } + } + } + + // version with 2 provided parameters (bounds) + case lubExp :: hlbExp :: exp :: Nil => + if (!lubExp.foldable || !hlbExp.foldable) { + Left("parameters for histogram must be evaluable") + } else { + if (!lubExp.dataType.isInstanceOf[NumericType] || + !hlbExp.dataType.isInstanceOf[NumericType]) { + Left(s"wrong type of parameters for histogram (must be (numeric, numeric, numeric))") + } else { + (lubExp.eval(), hlbExp.eval()) match { + case (lub: Double, hlb: Double) => + Right(new HistogramFun(lub, hlb)) + case _ => + Left("wrong type of parameters for histogram, must be (double, double, numeric)") + } + } + } + + // version with 1 provided parameter (bins) + case binExp :: exp :: Nil => + if (!binExp.foldable) { + Left("parameters for histogram must be evaluable") + } else { + if 
(!binExp.dataType.isInstanceOf[IntegralType]) { + Left(s"wrong type of parameters for histogram (must be (integer, numeric))") + } else { + (binExp.eval()) match { + case bin: Int => + Right(new HistogramFun(numBins = bin)) + case _ => + Left("wrong type of parameters for histogram, must be (int, numeric)") + } + } + } + + // no parameter version + case others => + SingleParamAggFunctionChecker("histogram", others, new HistogramFun()).check + } + } + + def checkConcatParams(params: List[Expression]): AggFunOrError = { + params match { + // version with provided p parameter + case cExp :: exp :: Nil => + if (!cExp.foldable) { + Left("first parameter to concat must be evaluable") + } else { + if (!cExp.dataType.equals(StringType)) { + Left(s"wrong type of parameters for concat (must be (string, string))") + } else { + val cBox = cExp.eval() + cBox match { + case c: String => + Right(new ConcatFun(c)) + case _ => + Left("first parameter to concat must be a string") + } + } + } + + // no parameter version + case others => + SingleParamAggFunctionChecker("concat", others, new ConcatFun()).check + } + } + + def checkMaxElemParams(params: List[Expression]): AggFunOrError = { + SingleParamAggFunctionChecker("maxelem", params, MaxElemFun).check + } + + private case class SingleParamAggFunctionChecker[T <: + SomeAggregateFunction[_]](name: String, + params: List[Expression], + obj: T) { + def check: Either[String, T] = { + params match { + case exp :: Nil => + Right(obj) + case _ => + Left(s"wrong number of parameters for $name (must be 1)") + } + } + } + +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/DatumExtractor.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/DatumExtractor.scala new file mode 100644 index 0000000..d1225ed --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/DatumExtractor.scala @@ -0,0 +1,282 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. 
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import com.typesafe.scalalogging.slf4j.Logger +import org.apache.spark.sql.{ShortType, IntegerType, LongType, FloatType, DoubleType, StringType, DataType, Row} +import org.json4s.JsonAST.JNumber +import org.json4s.native.JsonMethods +import org.json4s._ +import us.jubat.common.Datum + +import scala.collection.mutable +import scala.collection.concurrent +import scala.collection.JavaConversions._ + +object DatumExtractor { + def extract(cm: CreateModel, + data: String, + featureFunctions: concurrent.Map[String, String], + logger: Logger): Datum + = extract(cm, JsonMethods.parse(data), featureFunctions, logger) + + def extract(cm: CreateModel, + data: JValue, + featureFunctions: concurrent.Map[String, String], + logger: Logger): Datum = { + // we can only process numeric and string values + val filtered: List[(String, JValue)] = data.filterField { + case JField(_, _: JNumber) | JField(_, _: JString) => true + case _ => false + } + + // fill the row with Spark-compatible values (String, Int, Long, Double) + val row = Row(filtered.map { + case (_, value: JString) => + value.s + case (_, value: JInt) if value.num.isValidInt => + value.num.toInt + case (_, value: JInt) => + // note: this may still overflow + value.num.toLong + case (_, value: JDecimal) => + value.num.toDouble + case (_, value: JDouble) => + value.num + }: _*) + // set the correct type information for Spark + val schema: Map[String, (Int, DataType)] = filtered.zipWithIndex.map { + case ((key, _: JString), ix) => + (key, (ix, StringType)) + case ((key, value: JInt), ix) if value.num.isValidInt => + (key, (ix, IntegerType)) + case ((key, _: JInt), ix) => + (key, (ix, LongType)) + case ((key, _: JNumber), ix) => + (key, (ix, DoubleType)) + }.toMap + + extract(cm, schema, row, featureFunctions, logger) + } + + def extract(cm: CreateModel, + schema: Map[String, (Int, DataType)], + row: Row, + featureFunctions: concurrent.Map[String, String], + logger: Logger): Datum = { + // schema is a mapping (column name -> (column index, column type)) + + val datum = new Datum() + val ds = new DatumSetter(datum, row, logger) + + // schemaCopy holds current candidate columns for feature extraction. + // one matches, then deleted from schemaCopy. 
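    // For illustration (hypothetical columns): with
    //   schema = Map("label" -> (0, StringType), "name" -> (1, StringType), "age" -> (2, IntegerType))
    // and a model declared with (label: label), schemaCopy starts out holding only
    // "name" and "age"; every match below removes the consumed column from schemaCopy,
    // so later wildcard specifiers do not pick it up again.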
+ val schemaCopy = mutable.Map[String, (Int, DataType)](schema.toSeq: _*) + // remove a label column or a id column + cm.labelOrId.foreach(schemaCopy -= _._2) + + def getFeatureFunctionBodyByName(f: String): String = { + featureFunctions.get(f) match { + case None => + val knownFuncs = featureFunctions.keys.mkString(", ") + val msg = s"feature function '$f' is not found (known: ${knownFuncs})" + logger.error(msg) + throw new RuntimeException(msg) + case Some(funcBody) => + funcBody + } + } + + // register feature functions + cm.featureExtraction.foreach { + case (_, "id") | (_, "unigram") | (_, "bigram") => + // do nothing + + case (NormalParameters(params), funcName) => + val funcBody = getFeatureFunctionBodyByName(funcName) + JavaScriptFeatureFunctionManager.register(funcName, params.length, funcBody) + + case (_, funcName) => + val funcBody = getFeatureFunctionBodyByName(funcName) + JavaScriptFeatureFunctionManager.register(funcName, 1, funcBody) + } + + type SchemaType = Seq[(String, (Int, DataType))] + + def processWithoutFeatureFunction(column: (String, (Int, DataType))): Unit = column match { + case (colName, (rowIdx, dataType)) => + ds.setFromRow(colName, rowIdx, dataType) + schemaCopy -= colName + } + + def processWithJubatusFeatureFunction(funcName: String, column: (String, (Int, DataType))): Unit = column match { + case (colName, (rowIdx, dataType)) => + ds.setFromRow(s"$colName-$funcName-jubaconv", rowIdx, dataType) + schemaCopy -= colName + } + + def processWithFeatureFunction(columns: SchemaType, funcName: String): Unit = { + val args: Seq[AnyRef] = columns.map { + case (colName, (rowIdx, dataType)) => + ds.getFromRow(rowIdx, dataType) match { + case None => + // TODO: improve message + val msg = s"failed to get $colName" + logger.error(msg) + throw new RuntimeException(msg) + + case Some(arg) => + arg.asInstanceOf[AnyRef] + } + } + + val values = JavaScriptFeatureFunctionManager.callAndGetValues(funcName, args: _*) + + val catArgNames = columns.map(_._1).mkString(",") + val outputColNameCommon = s"$funcName#$catArgNames" + + values.foreach { + case (key, value) => + val outputColName = + // if we have a single-valued return function, omit the object key, + // otherwise add it to the datum's key string + if (values.size == 1) { + outputColNameCommon + } else { + outputColNameCommon + "#" + key + } + value match { + case s: String => + ds.set(outputColName, s) + case x: Double => + ds.set(outputColName, x) + } + } + columns.foreach(schemaCopy -= _._1) + } + + def processColumns(funcName: String, columns: Seq[(String, (Int, DataType))]) = { + funcName match { + case "id" => + columns.foreach(processWithoutFeatureFunction) + case "unigram" | "bigram" => + columns.foreach(processWithJubatusFeatureFunction(funcName, _)) + case _ => + columns.foreach { + case arg => + processWithFeatureFunction(Seq(arg), funcName) + } + } + } + + cm.featureExtraction.foreach { + // * + case (WildcardAnyParameter, funcName) => + processColumns(funcName, schemaCopy.toSeq) + + // prefix_* + case (WildcardWithPrefixParameter(prefix), funcName) => + val processedColumns = schemaCopy.filter(_._1.startsWith(prefix)).toSeq + processColumns(funcName, processedColumns) + + // *_suffix + case (WildcardWithSuffixParameter(suffix), funcName) => + val processedColumns = schemaCopy.filter(_._1.endsWith(suffix)).toSeq + processColumns(funcName, processedColumns) + + // not wildcard + case (NormalParameters(params), funcName) => + params match { + case Nil => + val msg = "should not pass here. 
(this may be a bug of parser)" + logger.error(msg) + throw new RuntimeException(msg) // maybe RuntimeException is inappropriate... + + case colNames => + val columns: SchemaType = params.map { + case colName => + // if we have an explicitly specified column name, + // then it does not matter whether it was used before or + // not, so we access `schema`, not `schemaCopy` + schema.get(colName) match { + case None => + val msg = s"column named '$colName' not found" + logger.error(msg) + throw new RuntimeException(msg) + case Some((rowIdx, dataType)) => + (colName, (rowIdx, dataType)) + } + } + + funcName match { + case "id" if colNames.length == 1 => + processWithoutFeatureFunction(columns.head) + case "unigram" | "bigram" if colNames.length == 1 => + processWithJubatusFeatureFunction(funcName, columns.head) + case "id" => + val msg = "attempt to call id feature function with more than one arguments" + logger.error(msg) + throw new RuntimeException(msg) + case "unigram" | "bigram" => + val msg = "attempt to call Jubatus feature function with more than one argument" + logger.error(msg) + throw new RuntimeException(msg) + case _ => + processWithFeatureFunction(columns, funcName) + } + } + } + + datum + } +} + +private class DatumSetter(d: Datum, row: Row, logger: Logger) { + def getFromRow(rowIdx: Int, dataType: DataType): Option[Any] = dataType match { + case _ if row.isNullAt(rowIdx) => + // a null value is a property of one particular data value, so + // we will just ignore this value and continue with the next + None + case StringType => + Some(row.getString(rowIdx)) + case FloatType => + Some(row.getFloat(rowIdx).toDouble) + case DoubleType => + Some(row.getDouble(rowIdx)) + case ShortType => + Some(row.getShort(rowIdx).toDouble) + case IntegerType => + Some(row.getInt(rowIdx).toDouble) + case LongType => + Some(row.getLong(rowIdx).toDouble) + case other => + logger.warn(s"cannot take value of type '$other' from row $row") + None + } + def setFromRow(colName: String, rowIdx: Int, dataType: DataType) = { + getFromRow(rowIdx, dataType) match { + case Some(s: String) => + set(colName, s) + case Some(x: Double) => + set(colName, x) + case _ => + // do nothing + } + } + + def set(colName: String, value: String) = d.addString(colName, value) + def set(colName: String, value: Double) = d.addNumber(colName, value) +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/HandleExceptions.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/HandleExceptions.scala index 0929820..d6fe798 100644 --- a/processor/src/main/scala/us/jubat/jubaql_server/processor/HandleExceptions.scala +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/HandleExceptions.scala @@ -30,6 +30,8 @@ class HandleExceptions // `handle` asynchronously handles exceptions. service(request) handle { case error => + logger.error(error.toString) + logger.error(error.getMessage) logger.error(error.getStackTraceString) val statusCode = HttpResponseStatus.INTERNAL_SERVER_ERROR val body = ("result" -> error.getMessage) diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/Helpers.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/Helpers.scala new file mode 100644 index 0000000..135cddb --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/Helpers.scala @@ -0,0 +1,47 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. 
+// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import org.apache.spark.rdd.RDD +import org.joda.time.format.{DateTimeFormatterBuilder, ISODateTimeFormat} + +object Helpers { + def niceRDDString(rdd: RDD[_]): String = { + rdd.toDebugString.split('\n').map(" " + _).mkString("\n") + } + + // a date parser for 2014-11-21T15:52:21[.943321112] + protected val timestampParser = { + val fractionElem = new DateTimeFormatterBuilder() + .appendLiteral('.') + .appendFractionOfSecond(3, 9).toFormatter + new DateTimeFormatterBuilder() + .append(ISODateTimeFormat.date) + .appendLiteral('T') + .append(ISODateTimeFormat.hourMinuteSecond) + .appendOptional(fractionElem.getParser) + .toFormatter + } + + def parseTimestamp(s: String): Long = timestampParser.parseMillis(s) + + // a date formatter for 2014-11-21T15:52:21.943 + // note that this will only be used for window timestamps, so millisecond + // precision is totally ok + protected val timestampFormatter = ISODateTimeFormat.dateHourMinuteSecondMillis() + + def formatTimestamp(l: Long): String = timestampFormatter.print(l) +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/HybridProcessor.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/HybridProcessor.scala index a0c86db..eb2b97b 100644 --- a/processor/src/main/scala/us/jubat/jubaql_server/processor/HybridProcessor.scala +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/HybridProcessor.scala @@ -15,6 +15,8 @@ // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA package us.jubat.jubaql_server.processor +import RunMode.Development + import scala.concurrent.future import scala.concurrent.ExecutionContext.Implicits.global import org.apache.spark.SparkContext @@ -40,10 +42,21 @@ import org.json4s.native.JsonMethods._ // "struct" holding the number of processed items, runtime in ms and largest seen id case class ProcessingInformation(itemCount: Long, runtime: Long, maxId: Option[String]) +// an object describing the state of the processor +sealed trait ProcessorState + +case object Initialized extends ProcessorState + +case object Running extends ProcessorState + +case object Finished extends ProcessorState + class HybridProcessor(sc: SparkContext, sqlc: SQLContext, storageLocation: String, - streamLocations: List[String]) + streamLocations: List[String], + runMode: RunMode = RunMode.Development, + checkpointDir: String = "file:///tmp/spark") extends LazyLogging { /* * We want to do processing of static data first, then continue with @@ -92,12 +105,18 @@ class HybridProcessor(sc: SparkContext, else logger.warn("could not extract number of cores from run command: " + _runCmd) - // define the formats that we can use + /// define the STORAGE sources that we can use + // a file in the local file system (must be accessible by all executors) val fileRe = 
"""file://(.+)""".r + // a file in HDFS val hdfsRe = """(hdfs://.+)""".r + // an empty data set + val emptyRe = """^empty(.?)""".r + /// define the STREAM sources that we can use + // a Kafka message broker (host:port/topic/groupid) val kafkaRe = """kafka://([^/]+)/([^/]+)/([^/]+)$""".r + // endless dummy JSON data val dummyRe = """^dummy(.?)""".r - val emptyRe = """^empty(.?)""".r val validStaticLocations: List[Regex] = emptyRe :: fileRe :: hdfsRe :: Nil val validStreamLocations: List[Regex] = dummyRe :: kafkaRe :: Nil @@ -127,13 +146,29 @@ class HybridProcessor(sc: SparkContext, // Flag that stores whether user stopped data processing manually protected var userStoppedProcessing = false + // state of the processor + protected var _state: ProcessorState = Initialized + + protected def setState(newState: ProcessorState) = synchronized { + _state = newState + } + + def state: ProcessorState = synchronized { + _state + } + /** - * Start hybrid processing using the given transformation. + * Start hybrid processing using the given RDD[JValue] operation. * - * @param transform an RDD operation that will be performed on each batch + * The stream data will be parsed into a JValue (if possible) and the + * transformation is expected to act on the resulting RDD[JValue]. + * Note that *as opposed to* the `start(SchemaRDD => SchemaRDD)` version, + * if the input RDD is empty, the function will still be executed. + * + * @param process an RDD operation that will be performed on each batch * @return one function to stop processing and one to get the highest IDs seen so far */ - def start(transform: RDD[JValue] => RDD[_]): (() => (ProcessingInformation, ProcessingInformation), + def startJValueProcessing(process: RDD[JValue] => Unit): (() => (ProcessingInformation, ProcessingInformation), () => Option[IdType]) = { val parseJsonStringIntoOption: (String => Traversable[JValue]) = line => { val maybeJson = parseOpt(line) @@ -143,54 +178,132 @@ class HybridProcessor(sc: SparkContext, } maybeJson } - val parseAndTransform: RDD[String] => RDD[Unit] = rdd => { - transform(rdd.flatMap(parseJsonStringIntoOption)).map(_ => ()) - } - _start(parseAndTransform) + // parse DStream[String] into DStream[JValue] item by item, + // skipping unparseable strings + val parseJsonDStream = (stream: DStream[String]) => + stream.flatMap(parseJsonStringIntoOption) + val processJsonDStream: DStream[JValue] => Unit = + _.foreachRDD(process) + // start processing + _start(parseJsonDStream, processJsonDStream) } /** - * Start hybrid processing using the given transformation. + * Start hybrid processing using the given SchemaRDD operation. * - * @param transform an RDD operation that will be performed on each batch + * The stream data will be equipped with a schema (either as passed + * as a parameter or as inferred by `SQLContext.jsonRDD()`) and the + * operation is expected to act on the resulting SchemaRDD. + * Note that if the RDD is empty, the given function will not be + * executed at all (not even with an empty RDD as a parameter). 
+ * + * @param process an RDD operation that will be performed on each batch * @return one function to stop processing and one to get the highest IDs seen so far */ - def start(transform: SchemaRDD => SchemaRDD, + def startTableProcessing(process: SchemaRDD => Unit, schema: Option[StructType]): (() => (ProcessingInformation, ProcessingInformation), () => Option[IdType]) = { - val parseAndTransform: RDD[String] => RDD[Unit] = rdd => { - // with an empty RDD, we cannot infer the schema (it will raise an exception) - if (rdd.count() > 0) { - // parse with schema or infer if not given - val jsonRdd = schema.map(sqlc.jsonRDD(rdd, _)).getOrElse(sqlc.jsonRDD(rdd, 0.1)) - transform(jsonRdd).map(_ => ()) - } else { - // create an (empty) SchemaRDD - rdd.map(_ => ()) - } + // parse DStream[String] into a row/column shaped stream + val parseJson: DStream[String] => SchemaDStream = schema match { + case Some(givenSchema) => + SchemaDStream.fromStringStreamWithSchema(sqlc, _, givenSchema, None) + case None => + SchemaDStream.fromStringStream(sqlc, _, None) + } + // We must only execute the process function if the RDD is non-empty. + // For inferred schema method, if the RDD is empty then the schema + // will be empty, too. For given schema method, we have to check + // the actual count (which is more expensive). + val processIfNotEmpty: SchemaRDD => Unit = schema match { + case Some(givenSchema) => + rdd => if (rdd.count() > 0) process(rdd) + case None => + rdd => if (rdd.schema.fields.size > 0) process(rdd) } - _start(parseAndTransform) + val processStream: SchemaDStream => Unit = + _.foreachRDD(processIfNotEmpty) + _start[SchemaDStream](parseJson, processStream) } /** - * Start hybrid processing using the given transformation. + * Start hybrid processing using the given SchemaRDD operation. * - * @param parseAndTransform an RDD operation that will be performed on each batch + * The stream data will be equipped with a schema (either as passed + * as a parameter or as inferred by `SQLContext.jsonRDD()`) and the + * operation is expected to act on the resulting SchemaDStream. + * The function is responsible for triggering output operations. + * + * @param process a function to transform and operate on the main DStream + * @return one function to stop processing and one to get the highest IDs seen so far + */ + def startTableProcessingGeneral(process: SchemaDStream => Unit, + schema: Option[StructType], + inputStreamName: String): (() => (ProcessingInformation, + ProcessingInformation), () => Option[IdType]) = { + // parse DStream[String] into a row/column shaped stream + val parseJson: DStream[String] => SchemaDStream = schema match { + case Some(givenSchema) => + SchemaDStream.fromStringStreamWithSchema(sqlc, _, givenSchema, Some(inputStreamName)) + case None => + SchemaDStream.fromStringStream(sqlc, _, Some(inputStreamName)) + } + _start[SchemaDStream](parseJson, process) + } + + /** + * Start hybrid processing using the given operation. + * + * The function passed in must operate on an RDD[String] (the stream data + * to be processed in a single batch), where each item of the RDD can be + * assumed to be JSON-encoded. The function *itself* is responsible to + * start computation (e.g. by using `rdd.foreach()` or `rdd.count()`). + * As that function can do arbitrary (nested and chained) processing, the + * notion of "number of processed items" makes only limited sense; we + * work with the "number of input items" instead. 
+ * + * @param parseJson a function to get the input stream into something processable, + * like `DStream[String] => DStream[JValue]` or + * `DStream[String] => SchemaDStream`. "processable" means + * that there is a `foreachRDD()` method matching the + * parameter type of the `process()` function. + * (This is applied duck typing!) + * @param process the actual operations on the parsed data stream. Note that + * this function is responsible for calling an output operation. + * @tparam T the type of RDD that the parsed stream will allow processing on, + * like `RDD[JValue]` or `SchemaRDD` * @return one function to stop processing and one to get the highest IDs seen so far */ - protected def _start(parseAndTransform: RDD[String] => RDD[Unit]): + protected def _start[T](parseJson: DStream[String] => T, + process: T => Unit): (() => (ProcessingInformation, ProcessingInformation), () => Option[IdType]) = { + if (state != Initialized) { + val msg = "processor cannot be started in state " + state + logger.error(msg) + throw new RuntimeException(msg) + } + setState(Running) logger.debug("creating StreamingContext for static data") + /* In order for updateStreamByKey() to work, we need to enable RDD checkpointing + * by setting a checkpoint directory. Note that this is different from enabling + * Streaming checkpointing (which would be needed for driver fault-tolerance), + * which would require the whole state of the application (in particular, all + * functions in stream.foreachRDD(...) calls) to be serializable. This would + * mean a rewrite of large parts of code, if it is possible at all. + * Also see . + */ + sc.setCheckpointDir(checkpointDir) ssc_ = new StreamingContext(sc, Seconds(2)) // this has to match our jubaql_timestamp inserted by fluentd - val extractRe = """.+"jubaql_timestamp": ?"([0-9\-:.T]+)".*""".r + val timestampInJsonRe = """ *"jubaql_timestamp": ?"([0-9\-:.T]+)" *""".r + // Extract a jubaql_timestamp field from a JSON-shaped string and return it. 
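    // For illustration (made-up record): given the string
    //   """{"age": 26, "jubaql_timestamp": "2014-11-21T15:52:21.943"}"""
    // extractId returns "2014-11-21T15:52:21.943"; an item without a matching
    // jubaql_timestamp field maps to the empty string.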
val extractId: String => IdType = item => { - item match { - case extractRe(idString) => - idString - case _ => + timestampInJsonRe.findFirstMatchIn(item) match { + case Some(aMatch) => + val id = aMatch.group(1) + id + case None => "" } } @@ -257,13 +370,15 @@ class HybridProcessor(sc: SparkContext, logger.debug("not repartitioning") staticData } + // first find the maximal ID in the data and count it repartitionedData.map(item => { + val id = extractId(item) // update maximal ID - maxStaticId += Some(extractId(item)) - item - }).transform(parseAndTransform).foreachRDD(rdd => { + maxStaticId += Some(id) + id + }).foreachRDD(rdd => { val count = rdd.count() - // we count the number of total processed rows (on the driver) + // we count the number of total input rows (on the driver) countStatic += count // stop processing of static data if there are no new files if (count == 0) { @@ -275,6 +390,9 @@ class HybridProcessor(sc: SparkContext, logger.info(s"processed $count (static) lines") } }) + // now do the actual processing + val mainStream = parseJson(repartitionedData) + process(mainStream) // start first StreamingContext logger.info("starting static data processing") @@ -315,7 +433,7 @@ class HybridProcessor(sc: SparkContext, } else { logger.warn("static data processing ended, but did not complete") } - staticStreamingContext.stop(false, true) + staticStreamingContext.stop(stopSparkContext = false, stopGracefully = true) logger.debug("bye from thread to wait for completion of static processing") } onFailure { case error: Throwable => @@ -326,7 +444,6 @@ class HybridProcessor(sc: SparkContext, future { // NB. This is a separate thread. In functions that will be serialized, // you cannot necessarily use variables from outside this thread. - // Also see . val localExtractId = extractId val localCountStream = countStream val localMaxStreamId = maxStreamId @@ -340,6 +457,16 @@ class HybridProcessor(sc: SparkContext, staticRunTime = System.currentTimeMillis() - staticStartTime logger.debug("static processing ended after %d items and %s ms, largest seen ID: %s".format( countStatic.value, staticRunTime, largestStaticItemId)) + logger.debug("sleeping a bit to allow Spark to settle") + runMode match { + case Development => + Thread.sleep(200) + case _ => + // If we don't sleep long enough here, then old/checkpointed RDDs + // won't be cleaned up in time before the next process starts. For + // some reason, this happens only with YARN. + Thread.sleep(8000) + } if (staticProcessingComplete && !userStoppedProcessing) { logger.info("static processing completed successfully, setting up stream") streamLocations match { @@ -347,11 +474,15 @@ class HybridProcessor(sc: SparkContext, // set up stream processing logger.debug("creating StreamingContext for stream data") ssc_ = new StreamingContext(sc, Seconds(2)) - val allStreamData: DStream[String] = streamLocation match { + val allStreamData: DStream[(IdType, String)] = (streamLocation match { case dummyRe(nothing) => - // dummy JSON data emitted over and over - val dummyData = sc.parallelize("{\"id\": 5}" :: "{\"id\": 6}" :: - "{\"id\": 7}" :: Nil) + // dummy JSON data emitted over and over (NB. 
the timestamp + // is not increasing over time) + val dummyData = sc.parallelize( + """{"gender":"m","age":26,"jubaql_timestamp":"2014-11-21T15:52:21.943321112"}""" :: + """{"gender":"f","age":24,"jubaql_timestamp":"2014-11-21T15:52:22"}""" :: + """{"gender":"m","age":31,"jubaql_timestamp":"2014-11-21T15:53:21.12345"}""" :: + Nil) new ConstantInputDStream(ssc_, dummyData) case kafkaRe(zookeeper, topics, groupId) => // connect to the given Kafka instance and receive data @@ -366,38 +497,41 @@ class HybridProcessor(sc: SparkContext, // left for broadcast variables, so we cannot communicate // our "runState = false" information. StorageLevel.DISK_ONLY).map(_._2) - } + }).map(item => (localExtractId(item), item)) val streamData = (largestStaticItemId match { case Some(largestId) => // only process items with a strictly larger id than what we // have seen so far logger.info("filtering for items with an id larger than " + largestId) - allStreamData.filter(item => { - localExtractId(item) > largestId + allStreamData.filter(itemWithId => { + itemWithId._1 > largestId }) case None => // don't do any ID filtering if there is no "largest id" logger.info("did not see any items in static processing, " + "processing whole stream") allStreamData - }).map(item => { + }).map(itemWithId => { // remember the largest seen ID - localMaxStreamId += Some(localExtractId(item)) - item + localMaxStreamId += Some(itemWithId._1) + itemWithId._2 }) logger.debug("stream data DStream: " + streamData) - streamData.transform(parseAndTransform).foreachRDD(rdd => { - // this `count` is *necessary* to trigger the (lazy) transformation! + streamData.foreachRDD(rdd => { val count = rdd.count() // we count the number of total processed rows (on the driver) localCountStream += count logger.info(s"processed $count (stream) lines") }) + // now do the actual processing + val mainStream = parseJson(streamData) + process(mainStream) // start stream processing synchronized { if (userStoppedProcessing) { logger.info("processing was stopped by user during stream setup, " + "not starting") + setState(Finished) } else { logger.info("starting stream processing") streamStartTime = System.currentTimeMillis() @@ -407,21 +541,26 @@ class HybridProcessor(sc: SparkContext, case Nil => logger.info("not starting stream processing " + "(no stream source given)") + setState(Finished) case _ => logger.error("not starting stream processing " + "(multiple streams not implemented)") + setState(Finished) } } else if (staticProcessingComplete && userStoppedProcessing) { logger.info("static processing was stopped by user, " + "not setting up stream") + setState(Finished) } else { logger.warn("static processing did not complete successfully, " + "not setting up stream") + setState(Finished) } logger.debug("bye from thread to start stream processing") } onFailure { case error: Throwable => logger.error("Error while setting up stream processing", error) + setState(Finished) } // return a function to stop the data processing @@ -431,7 +570,7 @@ class HybridProcessor(sc: SparkContext, userStoppedProcessing = true } logger.debug("now stopping the StreamingContext") - currentStreamingContext.stop(false, true) + currentStreamingContext.stop(stopSparkContext = false, stopGracefully = true) logger.debug("done stopping the StreamingContext") // if stream processing was not started or there was a runtime already // computed, we don't update the runtime @@ -441,13 +580,31 @@ class HybridProcessor(sc: SparkContext, logger.info(("processed %s items in %s ms (static) and 
%s items in " + "%s ms (stream)").format(countStatic.value, staticRunTime, countStream.value, streamRunTime)) + setState(Finished) (ProcessingInformation(countStatic.value, staticRunTime, maxStaticId.value), ProcessingInformation(countStream.value, streamRunTime, maxStreamId.value)) }, () => maxStaticId.value) } + /** + * Allows the user to wait for termination of the processing. + * If an exception happens during processing, an exception will be thrown here. + */ def awaitTermination() = { - ssc_.awaitTermination() + logger.debug("user is waiting for termination ...") + try { + ssc_.awaitTermination() + setState(Finished) + } catch { + case e: Throwable => + logger.warn("StreamingContext threw an exception (\"%s\"), shutting down".format( + e.getMessage)) + // when we got an exception, clean up properly + ssc_.stop(stopSparkContext = false, stopGracefully = true) + setState(Finished) + logger.info(s"streaming context was stopped after exception") + throw e + } } } diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/JavaScriptHelpers.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/JavaScriptHelpers.scala new file mode 100644 index 0000000..dbc4155 --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/JavaScriptHelpers.scala @@ -0,0 +1,150 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import javax.mail.internet.{InternetAddress, MimeMessage} +import javax.mail.{Message, Session, Transport} + +import dispatch.Defaults._ +import dispatch._ + +import scala.collection.JavaConversions._ +import scala.util.{Failure, Success, Try} + +object JavaScriptHelpers { + def test(): String = { + "test" + } + + val h = Http() + + /** + * Make a simple HTTP GET request. + * + * @param where URL to query + * @return response body + */ + def httpGet(where: String): Try[String] = { + val r = url(where) + makeRequest(r) + } + + /** + * Make a simple HTTP GET request with URL parameters. + * + * @param where URL to query + * @param params a key-value JavaScript object (will be sent as URL + * parameters) + * @return response body + */ + def httpGet(where: String, params: java.util.Map[_, _]): Try[String] = { + val urlParams = stringifyMap(params) + val r = url(where) < + Success("Mail sent") + case Failure(err) => + println(err) + Failure(err) + } + } + + /** + * Perform a blocking request and return the result as a Try-wrapped string. 
+ */ + protected def makeRequest(req: Req): Try[String] = { + h(req OK as.String).either.map(_ match { + case Left(err) => Failure(err) + case Right(s) => Success(s) + }).apply() + } + + protected def stringifyMap(obj: java.util.Map[_, _]): Map[String, String] = { + (obj.toList.map { case (key, value) => + (key.toString, value match { + case s: String => + s + case d: java.lang.Double if d.toString.endsWith(".0") => + d.toInt.toString + case other => other.toString + }) + }).toMap + } + + def javaScriptToScala(s: String) = s + def javaScriptToScala(x: Double) = x +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/JavaScriptUDFManager.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/JavaScriptUDFManager.scala new file mode 100644 index 0000000..1b2af8b --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/JavaScriptUDFManager.scala @@ -0,0 +1,131 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import scala.collection.mutable +import scala.collection.JavaConversions +import javax.script.{ScriptEngine, ScriptEngineManager, Invocable} + +import scala.util.{Failure, Success, Try} + +class JavaScriptUDFManager { + // The null is required. + // See: http://stackoverflow.com/questions/20168226/sbt-0-13-scriptengine-is-null-for-getenginebyname-javascript + private val scriptEngineManager = new ScriptEngineManager(null) + + private val jsEngines = new ThreadLocal[ScriptEngine] { + override def initialValue() = createScriptEngine() + } + + private case class Mapped(nargs: Int, funcBody: String, var threadIds: List[Long]) + private val funcs = new mutable.HashMap[String, Mapped] + + // throws javax.script.ScriptException when funcBody is invalid. 
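  // For illustration (hypothetical UDF): after
  //   JavaScriptUDFManager.register("addOne", 1, "function addOne(x) { return x + 1; }")
  // a subsequent
  //   JavaScriptUDFManager.call[Double]("addOne", Double.box(41))
  // evaluates the function in the calling thread's engine and yields roughly
  // Some(42.0), or None if the invocation throws.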
+ def register(funcName: String, nargs: Int, funcBody: String): Unit = { + val engine = getScriptEngine() + val threadId = Thread.currentThread.getId + + funcs.synchronized { + def overwriteFunc(): Unit = { + funcs += (funcName -> Mapped(nargs, funcBody, List(threadId))) + } + + funcs.get(funcName) match { + case None => + overwriteFunc() + case Some(m) if funcBody != m.funcBody => + overwriteFunc() + + case Some(m) => + if (m.threadIds.contains(threadId)) + return + m.threadIds = threadId :: m.threadIds + } + engine.eval(funcBody) + } + } + + private def invoke(funcName: String, args: AnyRef*): AnyRef = { + val inv = getInvocableEngine() + inv.invokeFunction(funcName, args: _*) + } + + def call[T](funcName: String, args: AnyRef*): Option[T] = { + Try { + invoke(funcName, args:_*).asInstanceOf[T] + } match { + case Success(value) => Some(value) + case Failure(err) => None + } + } + + def tryCall[T](funcName: String, args: AnyRef*): Try[T] = Try { + invoke(funcName, args:_*).asInstanceOf[T] + } + + def registerAndCall[T](funcName: String, nargs: Int, funcBody: String, args: AnyRef*): Option[T] = { + register(funcName, nargs, funcBody) + call[T](funcName, args:_*) + } + + def registerAndTryCall[T](funcName: String, nargs: Int, funcBody: String, args: AnyRef*): Try[T] = { + register(funcName, nargs, funcBody) + tryCall[T](funcName, args:_*) + } + + def getNumberOfArgsByFunctionName(fname: String): Option[Int] = funcs.synchronized { + funcs.get(fname).map(_.nargs) + } + + // This method is required because Rhino may return ConsString (!= java.lang.String) + def asScala(x: AnyRef) = { + val inv = getInvocableEngine + inv.invokeMethod(JavaScriptHelpers, "javaScriptToScala", x) + } + + private def getScriptEngine(): ScriptEngine = jsEngines.get + + private def getInvocableEngine(): Invocable = { + getScriptEngine().asInstanceOf[Invocable] + } + + private def createScriptEngine(): ScriptEngine = { + var engine: ScriptEngine = null + scriptEngineManager.synchronized { + engine = scriptEngineManager.getEngineByName("JavaScript") + } + if (engine == null) { + val threadId = Thread.currentThread.getId + throw new Exception("failed to create JavaScript engine in thread %d".format(threadId)) + } + engine.put("jql", JavaScriptHelpers) + + engine + } +} + +object JavaScriptUDFManager extends JavaScriptUDFManager + +object JavaScriptFeatureFunctionManager extends JavaScriptUDFManager { + def callAndGetValues(funcName: String, args: AnyRef*): Map[String, Any] = { + tryCall[java.util.Map[String, AnyRef]](funcName, args:_*) match { + case Success(m) => + JavaConversions.mapAsScalaMap(m).toMap.mapValues(asScala) + case Failure(err) => + throw err + } + } +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLAST.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLAST.scala index eb36315..5bf3690 100644 --- a/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLAST.scala +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLAST.scala @@ -15,8 +15,10 @@ // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA package us.jubat.jubaql_server.processor +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.expressions.Expression -sealed abstract trait JubaQLAST +sealed trait JubaQLAST case class CreateDatasource(sourceName: String, @@ -27,21 +29,49 @@ CreateDatasource(sourceName: String, case class CreateModel(algorithm: String, modelName: String, - configJson: String, - 
specifier: List[(String, List[String])]) extends JubaQLAST { - override def toString: String = "CreateModel(%s,%s,%s,%s)".format( + labelOrId: Option[(String, String)], + featureExtraction: List[(FeatureFunctionParameters, String)], + configJson: String) extends JubaQLAST { + override def toString: String = "CreateModel(%s,%s,%s,%s,%s)".format( algorithm, modelName, + labelOrId, + featureExtraction, if (configJson.size > 13) configJson.take(5) + "..." + configJson.takeRight(5) - else configJson, - specifier + else configJson ) } case class Update(modelName: String, rpcName: String, source: String) extends JubaQLAST +case class CreateStreamFromSelect(streamName: String, selectPlan: LogicalPlan) extends JubaQLAST + +case class CreateStreamFromAnalyze(streamName: String, analyze: Analyze, newColumn: Option[String]) extends JubaQLAST + +case class CreateTrigger(dsName: String, condition: Option[Expression], expr: Expression) extends JubaQLAST + +case class CreateStreamFromSlidingWindow(streamName: String, windowSize: Int, slideInterval: Int, + windowType: String, source: LogicalPlan, + funcSpecs: List[(String, List[Expression], Option[String])], + postCond: Option[Expression]) extends JubaQLAST + case class Analyze(modelName: String, rpcName: String, data: String) extends JubaQLAST +case class LogStream(streamName: String) extends JubaQLAST + +case class Status() extends JubaQLAST + case class Shutdown() extends JubaQLAST +case class StartProcessing(dsName: String) extends JubaQLAST + case class StopProcessing() extends JubaQLAST + +case class CreateFunction(funcName: String, args: List[(String, String)], + returnType: String, lang: String, body: String) extends JubaQLAST + +case class CreateFeatureFunction(funcName: String, args: List[(String, String)], + lang: String, body: String) extends JubaQLAST + +case class CreateTriggerFunction(funcName: String, args: List[(String, String)], + lang: String, body: String) extends JubaQLAST diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/json/ClassifierPrediction.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLContext.scala similarity index 79% rename from processor/src/main/scala/us/jubat/jubaql_server/processor/json/ClassifierPrediction.scala rename to processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLContext.scala index 89ffe04..19fa6e4 100644 --- a/processor/src/main/scala/us/jubat/jubaql_server/processor/json/ClassifierPrediction.scala +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLContext.scala @@ -13,6 +13,9 @@ // You should have received a copy of the GNU Lesser General Public // License along with this library; if not, write to the Free Software // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA -package us.jubat.jubaql_server.processor.json +package us.jubat.jubaql_server.processor -case class ClassifierPrediction(label: String, score: Double) +import org.apache.spark.SparkContext +import org.apache.spark.sql.SQLContext + +class JubaQLContext(sc: SparkContext, @transient val parser: JubaQLParser) extends SQLContext(sc) diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLParser.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLParser.scala index 64c9a5d..6156c1c 100644 --- a/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLParser.scala +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLParser.scala @@ -15,12 +15,63 @@ // Foundation, Inc., 51 Franklin 
Street, Fifth Floor, Boston, MA 02110-1301 USA package us.jubat.jubaql_server.processor -import org.apache.spark.sql.catalyst.SqlParser +import org.apache.spark.sql.catalyst.types.BooleanType +import org.apache.spark.sql.catalyst.{SqlLexical, SqlParser} +import org.apache.spark.sql.catalyst.analysis.{Star, UnresolvedAttribute, UnresolvedRelation} +import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import com.typesafe.scalalogging.slf4j.LazyLogging +import scala.util.parsing.input.CharArrayReader._ + +// TODO: move these to a proper file. +// TODO: rename to better ones. +sealed trait FeatureFunctionParameters + +case object WildcardAnyParameter extends FeatureFunctionParameters + +case class WildcardWithPrefixParameter(prefix: String) extends FeatureFunctionParameters + +case class WildcardWithSuffixParameter(suffix: String) extends FeatureFunctionParameters + +case class NormalParameters(params: List[String]) extends FeatureFunctionParameters + + class JubaQLParser extends SqlParser with LazyLogging { + class JubaQLLexical(keywords: Seq[String]) extends SqlLexical(keywords) { + case class CodeLit(chars: String) extends Token { + override def toString = "$$"+chars+"$$" + } + + // used for parsing $$-delimited code blocks + protected lazy val codeDelim: Parser[String] = '$' ~ '$' ^^ + { case a ~ b => "$$" } + + protected lazy val stringWithoutCodeDelim: Parser[String] = rep1( chrExcept('$', EofCh) ) ^^ + { case chars => chars mkString "" } + + protected lazy val codeContents: Parser[String] = repsep(stringWithoutCodeDelim, '$') ^^ + { case words => words mkString "$" } + + override lazy val token: Parser[Token] = + ( identChar ~ rep( identChar | digit ) ^^ { case first ~ rest => processIdent(first :: rest mkString "") } + | rep1(digit) ~ opt('.' ~> rep(digit)) ^^ { + case i ~ None => NumericLit(i mkString "") + case i ~ Some(d) => FloatLit(i.mkString("") + "." 
+ d.mkString("")) + } + | '\'' ~ rep( chrExcept('\'', EofCh) ) ~ '\'' ^^ { case '\'' ~ chars ~ '\'' => StringLit(chars mkString "") } + | '\"' ~ rep( chrExcept('\"', EofCh) ) ~ '\"' ^^ { case '\"' ~ chars ~ '\"' => StringLit(chars mkString "") } + | codeDelim ~> codeContents <~ codeDelim ^^ { case chars => CodeLit(chars) } + | EofCh ^^^ EOF + | codeDelim ~> failure("unclosed code literal") + | '\'' ~> failure("unclosed string literal") + | '\"' ~> failure("unclosed string literal") + | delim + | failure("illegal character") + ) + } + protected lazy val CREATE = Keyword("CREATE") protected lazy val DATASOURCE = Keyword("DATASOURCE") protected lazy val MODEL = Keyword("MODEL") @@ -32,19 +83,70 @@ class JubaQLParser extends SqlParser with LazyLogging { protected lazy val ANALYZE = Keyword("ANALYZE") protected lazy val USING = Keyword("USING") protected lazy val DATA = Keyword("DATA") + protected lazy val LOG = Keyword("LOG") protected lazy val STORAGE = Keyword("STORAGE") protected lazy val STREAM = Keyword("STREAM") - protected lazy val config = Keyword("config") + protected lazy val CONFIG = Keyword("CONFIG") protected lazy val numeric = Keyword("numeric") protected lazy val string = Keyword("string") protected lazy val boolean = Keyword("boolean") + protected lazy val STATUS = Keyword("STATUS") protected lazy val SHUTDOWN = Keyword("SHUTDOWN") + protected lazy val START = Keyword("START") protected lazy val STOP = Keyword("STOP") protected lazy val PROCESSING = Keyword("PROCESSING") + protected lazy val FUNCTION = Keyword("FUNCTION") + protected lazy val RETURNS = Keyword("RETURNS") + protected lazy val LANGUAGE = Keyword("LANGUAGE") + protected lazy val FEATURE = Keyword("FEATURE") + protected lazy val TRIGGER = Keyword("TRIGGER") + protected lazy val FOR = Keyword("FOR") + protected lazy val EACH = Keyword("EACH") + protected lazy val ROW = Keyword("ROW") + protected lazy val EXECUTE = Keyword("EXECUTE") + protected lazy val SLIDING = Keyword("SLIDING") + protected lazy val WINDOW = Keyword("WINDOW") + protected lazy val SIZE = Keyword("SIZE") + protected lazy val ADVANCE = Keyword("ADVANCE") + protected lazy val TIME = Keyword("TIME") + protected lazy val TUPLES = Keyword("TUPLES") + protected lazy val OVER = Keyword("OVER") + + override val lexical = new JubaQLLexical(reservedWords) + + // we should allow some common column names that have are also known as keywords + protected lazy val colIdent = (COUNT | TIME | STATUS | MODEL | GROUP | + ORDER | ident) + + override lazy val baseExpression: PackratParser[Expression] = + expression ~ "[" ~ expression <~ "]" ^^ { + case base ~ _ ~ ordinal => GetItem(base, ordinal) + } | + TRUE ^^^ Literal(true, BooleanType) | + FALSE ^^^ Literal(false, BooleanType) | + cast | + "(" ~> expression <~ ")" | + function | + "-" ~> literal ^^ UnaryMinus | + colIdent ^^ UnresolvedAttribute | // was: ident + "*" ^^^ Star(None) | + literal + + override lazy val projection: Parser[Expression] = + expression ~ (opt(AS) ~> opt(colIdent)) ^^ { // was: opt(ident) + case e ~ None => e + case e ~ Some(a) => Alias(e, a)() + } + + protected lazy val streamIdent = ident + + protected lazy val modelIdent = ident + + protected lazy val funcIdent = ident // column_name column_type protected lazy val stringPairs: Parser[(String, String)] = { - ident ~ (numeric | string | boolean) ^^ { + colIdent ~ (numeric | string | boolean) ^^ { case x ~ y => (x, y) } } @@ -62,7 +164,7 @@ class JubaQLParser extends SqlParser with LazyLogging { // CREATE DATASOURCE source_name ( column_name 
data_type, [...]) FROM sink_id protected lazy val createDatasource: Parser[JubaQLAST] = { - CREATE ~ DATASOURCE ~> ident ~ opt("(" ~ rep1sep(stringPairs, ",") ~ ")") ~ + CREATE ~ DATASOURCE ~> streamIdent ~ opt("(" ~ rep1sep(stringPairs, ",") ~ ")") ~ FROM ~ "(" ~ STORAGE ~ ":" ~ stringLit ~ opt(streamList) <~ ")" ^^ { case sourceName ~ rep ~ _ /*FROM*/ ~ _ ~ _ /*STORAGE*/ ~ _ ~ storage ~ streams => rep match { @@ -80,64 +182,141 @@ } } - protected lazy val createWith: Parser[(String, List[String])] = { - ident ~ ":" ~ stringLit ^^ { - case key ~ _ ~ value => - (key, List(value)) - } | - ident ~ ":" ~ "[" ~ rep1sep(stringLit, ",") <~ "]" ^^ { - case key ~ _ ~ _ ~ values => - (key, values) - } + protected lazy val createModel: Parser[JubaQLAST] = { + val wildcardAny: Parser[FeatureFunctionParameters] = "*" ^^ { + case _ => + WildcardAnyParameter + } + val wildcardWithPrefixParam: Parser[FeatureFunctionParameters] = ident <~ "*" ^^ { + case prefix => + WildcardWithPrefixParameter(prefix) + } + val wildcardWithSuffixParam: Parser[FeatureFunctionParameters] = "*" ~> ident ^^ { + case suffix => + WildcardWithSuffixParameter(suffix) + } + // wildcardWithSuffixParam must come first. + // If wildcardAny preceded it, *_suffix would always match wildcardAny. + val wildcard: Parser[FeatureFunctionParameters] = wildcardWithSuffixParam | wildcardAny | wildcardWithPrefixParam + + val oneParameter: Parser[NormalParameters] = colIdent ^^ { + case param => + NormalParameters(List(param)) + } + // this may take one parameter. Should such behavior be avoided? + val moreThanOneParameters: Parser[FeatureFunctionParameters] = "(" ~> rep1sep(colIdent, ",") <~ ")" ^^ { + case params => + NormalParameters(params) + } + + val featureFunctionParameters: Parser[FeatureFunctionParameters] = wildcard | oneParameter | moreThanOneParameters + + val labelOrId: Parser[(String, String)] = "(" ~> ident ~ ":" ~ colIdent <~ ")" ^^ { + case labelOrId ~ _ ~ value if labelOrId == "label" || labelOrId == "id" => + (labelOrId, value) + } + + val paramsAndFunction: Parser[(FeatureFunctionParameters, String)] = featureFunctionParameters ~ opt(WITH ~> funcIdent) ^^ { + case params ~ functionName => + (params, functionName.getOrElse("id")) + } + + CREATE ~> jubatusAlgorithm ~ MODEL ~ modelIdent ~ opt(labelOrId) ~ AS ~ + rep1sep(paramsAndFunction, ",") ~ CONFIG ~ stringLit ^^ { + case algorithm ~ _ ~ modelName ~ maybeLabelOrId ~ _ ~ l ~ _ ~ config => + CreateModel(algorithm, modelName, maybeLabelOrId, l, config) + } } - // CREATE algorithm_name MODEL jubatus_name WITH config = "json string" - protected lazy val createModel: Parser[JubaQLAST] = { - CREATE ~> jubatusAlgorithm ~ MODEL ~ ident ~ WITH ~ "(" ~ opt(rep1sep(createWith, ",")) ~ ")" ~ "config" ~ "=" ~ stringLit ^^ { - case algorithm ~ _ ~ modelName ~ _ /*with*/ ~ _ ~ cwith ~ _ ~ _ /*config*/ ~ _ ~ config => - CreateModel(algorithm, modelName, config, cwith.getOrElse(List[(String, List[String])]())) - } - } - - // This select copied from SqlParser, and removed `from` clause.
- protected lazy val jubaqlSelect: Parser[LogicalPlan] = - SELECT ~> opt(DISTINCT) ~ projections ~ - opt(filter) ~ - opt(grouping) ~ - opt(having) ~ - opt(orderBy) ~ - opt(limit) <~ opt(";") ^^ { - case d ~ p ~ f ~ g ~ h ~ o ~ l => - val base = NoRelation + protected lazy val createStreamFromSelect: Parser[JubaQLAST] = { + CREATE ~ STREAM ~> streamIdent ~ FROM ~ select ^^ { + case streamName ~ _ ~ selectPlan => + CreateStreamFromSelect(streamName, selectPlan) + } + } + + protected lazy val createStreamFromAnalyze: Parser[JubaQLAST] = { + CREATE ~ STREAM ~> streamIdent ~ FROM ~ analyzeStream ~ opt(AS ~> colIdent) ^^ { + case streamName ~ _ ~ analyzePlan ~ newColumn => + CreateStreamFromAnalyze(streamName, analyzePlan, newColumn) + } + } + + protected lazy val createTrigger: Parser[JubaQLAST] = { + CREATE ~ TRIGGER ~ ON ~> streamIdent ~ FOR ~ EACH ~ ROW ~ opt(WHEN ~> expression) ~ EXECUTE ~ function ^^ { + case dsName ~ _ ~ _ ~ _ ~ condition ~ _ ~ expr => + CreateTrigger(dsName, condition, expr) + } + } + + protected lazy val createStreamFromSlidingWindow: Parser[JubaQLAST] = { + val aggregation: Parser[(String, List[Expression], Option[String])] = + (ident | AVG) ~ "(" ~ rep1sep(expression, ",") ~ ")" ~ opt(AS ~> colIdent) ^^ { + case funcName ~ _ ~ parameters ~ _ ~ maybeAlias => + (funcName, parameters, maybeAlias) + } + val aggregationList = rep1sep(aggregation, ",") + + val filter: Parser[Expression] = WHERE ~ expression ^^ { case _ ~ e => e} + val having: Parser[Expression] = HAVING ~> expression + + CREATE ~ STREAM ~> streamIdent ~ FROM ~ SLIDING ~ WINDOW ~ + "(" ~ SIZE ~ numericLit ~ ADVANCE ~ numericLit ~ (TIME | TUPLES) ~ ")" ~ + OVER ~ streamIdent ~ WITH ~ aggregationList ~ opt(filter) ~ opt(having) ^^ { + case streamName ~ _ ~ _ ~ _ ~ _ ~ _ /* FROM SLIDING WINDOW ( SIZE */ ~ + size ~ _ /* ADVANCE */ ~ advance ~ windowType ~ _ /* ) */ ~ + _ /* OVER */ ~ source ~ _ /* WITH */ ~ funcSpecs ~ f ~ h => + // start from a table/stream with the given name + val base = UnresolvedRelation(Seq(source), None) + // apply the precondition val withFilter = f.map(f => Filter(f, base)).getOrElse(base) - val withProjection = - g.map { - g => - Aggregate(g, assignAliases(p), withFilter) - }.getOrElse(Project(assignAliases(p), withFilter)) - val withDistinct = d.map(_ => Distinct(withProjection)).getOrElse(withProjection) - val withHaving = h.map(h => Filter(h, withDistinct)).getOrElse(withDistinct) - val withOrder = o.map(o => Sort(o, withHaving)).getOrElse(withHaving) - val withLimit = l.map { - l => Limit(l, withOrder) - }.getOrElse(withOrder) - withLimit + // select only the column that we use in the window. + val allColumns = funcSpecs.map(_._2.last) + val withProjection = Project(assignAliases(allColumns), withFilter) + // NB. we have to add a Cast to the correct type in every column later, + // after we have mapped function names to concrete functions. 
+ + CreateStreamFromSlidingWindow(streamName, size.toInt, advance.toInt, + windowType.toLowerCase, withProjection, funcSpecs, + h) } + } + + protected lazy val logStream: Parser[JubaQLAST] = { + LOG ~ STREAM ~> streamIdent ^^ { + case streamName => + LogStream(streamName) + } + } protected lazy val update: Parser[JubaQLAST] = { - UPDATE ~ MODEL ~> ident ~ USING ~ ident ~ FROM ~ ident ^^ { + UPDATE ~ MODEL ~> modelIdent ~ USING ~ funcIdent ~ FROM ~ streamIdent ^^ { case modelName ~ _ ~ rpcName ~ _ ~ source => Update(modelName, rpcName, source) } } protected lazy val analyze: Parser[JubaQLAST] = { - ANALYZE ~> stringLit ~ BY ~ MODEL ~ ident ~ USING ~ ident ^^ { + ANALYZE ~> stringLit ~ BY ~ MODEL ~ modelIdent ~ USING ~ funcIdent ^^ { case data ~ _ ~ _ ~ modelName ~ _ ~ rpc => Analyze(modelName, rpc, data) } } + protected lazy val analyzeStream: Parser[Analyze] = { + ANALYZE ~> streamIdent ~ BY ~ MODEL ~ modelIdent ~ USING ~ funcIdent ^^ { + case source ~ _ ~ _ ~ modelName ~ _ ~ rpc => + Analyze(modelName, rpc, source) + } + } + + protected lazy val status: Parser[JubaQLAST] = { + STATUS ^^ { + case _ => + Status() + } + } + protected lazy val shutdown: Parser[JubaQLAST] = { SHUTDOWN ^^ { case _ => @@ -145,6 +324,13 @@ class JubaQLParser extends SqlParser with LazyLogging { } } + protected lazy val startProcessing: Parser[JubaQLAST] = { + START ~ PROCESSING ~> streamIdent ^^ { + case dsName => + StartProcessing(dsName) + } + } + protected lazy val stopProcessing: Parser[JubaQLAST] = { STOP ~> PROCESSING ^^ { case _ => @@ -152,24 +338,66 @@ class JubaQLParser extends SqlParser with LazyLogging { } } + /** A parser which matches a code literal */ + def codeLit: Parser[String] = + elem("code literal", _.isInstanceOf[lexical.CodeLit]) ^^ (_.chars) + + protected lazy val createFunction: Parser[JubaQLAST] = { + CREATE ~ FUNCTION ~> funcIdent ~ "(" ~ repsep(stringPairs, ",") ~ ")" ~ + RETURNS ~ (numeric | string| boolean) ~ LANGUAGE ~ ident ~ AS ~ codeLit ^^ { + case f ~ _ ~ args ~ _ ~ _ /*RETURNS*/ ~ retType ~ _ /*LANGUAGE*/ ~ lang ~ + _ /*AS*/ ~ body => + CreateFunction(f, args, retType, lang, body) + } + } + + protected lazy val createFeatureFunction: Parser[JubaQLAST] = { + CREATE ~ FEATURE ~ FUNCTION ~> funcIdent ~ "(" ~ repsep(stringPairs, ",") ~ ")" ~ + LANGUAGE ~ ident ~ AS ~ codeLit ^^ { + case f ~ _ ~ args ~ _ ~ _ /*LANGUAGE*/ ~ lang ~ + _ /*AS*/ ~ body => + CreateFeatureFunction(f, args, lang, body) + } + } + + protected lazy val createTriggerFunction: Parser[JubaQLAST] = { + CREATE ~ TRIGGER ~ FUNCTION ~> funcIdent ~ "(" ~ repsep(stringPairs, ",") ~ ")" ~ + LANGUAGE ~ ident ~ AS ~ codeLit ^^ { + case f ~ _ ~ args ~ _ ~ _ /*LANGUAGE*/ ~ lang ~ + _ /*AS*/ ~ body => + CreateTriggerFunction(f, args, lang, body) + } + } + protected lazy val jubaQLQuery: Parser[JubaQLAST] = { createDatasource | createModel | + createStreamFromSelect | + createStreamFromSlidingWindow | + createStreamFromAnalyze | + createTrigger | + logStream | update | analyze | + status | shutdown | - stopProcessing + startProcessing | + stopProcessing | + createFunction | + createFeatureFunction | + createTriggerFunction } // note: apply cannot override incompatible type with parent class //override def apply(input: String): Option[JubaQLAST] = { def parse(input: String): Option[JubaQLAST] = { + logger.info(s"trying to parse '$input'") phrase(jubaQLQuery)(new lexical.Scanner(input)) match { case Success(r, q) => - logger.debug(s"successfully parsed '$input' into $r") + logger.debug(s"successfully parsed input: $r") Option(r) 
case x => - logger.warn(s"failed to parse '$input' as JubaQL") + logger.warn(s"failed to parse input as JubaQL: $x") None } } diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLPatternLayout.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLPatternLayout.scala new file mode 100644 index 0000000..fc8d759 --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLPatternLayout.scala @@ -0,0 +1,46 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License version 2.1 as published by the Free Software Foundation. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +package us.jubat.jubaql_server.processor + +import java.net.InetAddress + +import org.apache.log4j.PatternLayout +import org.apache.log4j.helpers.{PatternConverter, PatternParser} +import org.apache.log4j.spi.LoggingEvent + +class JubaQLPatternLayout extends PatternLayout { + val hostname = InetAddress.getLocalHost().getHostName + + override def createPatternParser(pattern: String): PatternParser = { + new PatternParser(pattern) { + override def finalizeConverter(c: Char): Unit = { + c match { + // add a new 'h' pattern to the conversion string + case 'h' => + val pc = new PatternConverter { + override def convert(event: LoggingEvent): String = { + hostname + } + } + addConverter(pc) + // all other characters are handled by the original pattern parser + case other => + super.finalizeConverter(other) + } + } + } + } +} diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLProcessor.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLProcessor.scala index 67a5c16..c6a3ee9 100644 --- a/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLProcessor.scala +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLProcessor.scala @@ -20,7 +20,7 @@ import java.net.InetAddress import com.typesafe.scalalogging.slf4j.LazyLogging import org.apache.spark.SparkContext import com.twitter.finagle.{Http, Service} -import com.twitter.util.Await +import com.twitter.util.{Duration, Time, Await} import org.jboss.netty.handler.codec.http._ import sun.misc.{SignalHandler, Signal} @@ -63,6 +63,13 @@ object JubaQLProcessor extends LazyLogging { } logger.debug(s"Starting JubaQLProcessor in run mode $runMode") + // checkpointDir for Spark + val checkpointDir = scala.util.Properties.propOrElse("jubaql.checkpointdir", "") + if (checkpointDir.trim.isEmpty) { + logger.error("No jubaql.checkpointdir property") + System.exit(1) + } + // When run through spark-submit, the Java system property "spark.master" // will contain the master passed to spark-submit and we *must* use the // same; otherwise use "local[3]". 
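Note on the new JubaQLPatternLayout above: it registers a custom 'h' conversion character that log4j expands to the local hostname, so each log line can be attributed to the machine that produced it. Below is a minimal sketch of how the layout could be exercised, assuming the standard log4j 1.x API; the object name PatternLayoutExample is hypothetical, and presumably the layout is actually wired up via the log4j XML configuration rather than in code:

import org.apache.log4j.{ConsoleAppender, Logger}
import us.jubat.jubaql_server.processor.JubaQLPatternLayout

object PatternLayoutExample {
  def main(args: Array[String]): Unit = {
    // '%h' is the conversion character added by JubaQLPatternLayout;
    // all other pattern characters are handled by the stock PatternParser.
    val layout = new JubaQLPatternLayout()
    layout.setConversionPattern("%d %h [%t] %-5p %c - %m%n")
    Logger.getRootLogger.addAppender(new ConsoleAppender(layout))
    Logger.getRootLogger.info("hello from the JubaQL processor")
  }
}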
@@ -73,7 +80,7 @@ object JubaQLProcessor extends LazyLogging { val sc = new SparkContext(master, "JubaQL Processor") // start HTTP interface - val service: Service[HttpRequest, HttpResponse] = new JubaQLService(sc, runMode) + val service: Service[HttpRequest, HttpResponse] = new JubaQLService(sc, runMode, checkpointDir) val errorHandler = new HandleExceptions logger.info("JubaQLProcessor HTTP server starting") val server = Http.serve(":*", errorHandler andThen service) @@ -101,7 +108,10 @@ object JubaQLProcessor extends LazyLogging { unregister(regHandler) isRegistered = false } - Await.result(server.close()) + // close HTTP server only after a short timeout to finish requests + // (otherwise sometimes the response to a SHUTDOWN command won't + // arrive at the client) + Await.result(server.close(Time.now + Duration.fromSeconds(5))) } } diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLService.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLService.scala index 184ab5f..ed80a19 100644 --- a/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLService.scala +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/JubaQLService.scala @@ -16,60 +16,104 @@ package us.jubat.jubaql_server.processor import java.net.InetAddress +import java.text.SimpleDateFormat +import java.util.Date import java.util.concurrent.ConcurrentHashMap import com.twitter.finagle.Service import com.twitter.util.{Future => TwFuture, Promise => TwPromise} import com.typesafe.scalalogging.slf4j.LazyLogging import io.netty.util.CharsetUtil -import us.jubat.jubaql_server.processor.json.{AnomalyScore, ClassifierPrediction, ClassifierResult, DatumResult} -import us.jubat.jubaql_server.processor.updater.{Anomaly, Classifier, Recommender} -import org.apache.spark.SparkContext -import org.apache.spark.sql.SQLContext +import RunMode.{Production, Development} +import us.jubat.jubaql_server.processor.json._ +import us.jubat.jubaql_server.processor.updater._ +import org.apache.spark.{SparkFiles, SparkContext} +import org.apache.spark.SparkContext._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation} +import org.apache.spark.sql.catalyst.expressions.{Alias, Cast, Row} +import org.apache.spark.sql.catalyst.plans.logical.{Project, BinaryNode, LogicalPlan, UnaryNode} import org.apache.spark.sql.catalyst.types._ +import org.apache.spark.sql.{SQLContext, SchemaRDD} +import org.apache.spark.storage.StorageLevel +import org.apache.spark.streaming.StreamingContext +import org.apache.spark.streaming.StreamingContext._ +import org.apache.spark.streaming.dstream.DStream import org.jboss.netty.buffer.ChannelBuffers import org.jboss.netty.handler.codec.http._ import org.json4s._ import org.json4s.native.{JsonMethods, Serialization} +import org.json4s.JsonDSL._ +import sun.misc.Signal import us.jubat.anomaly.AnomalyClient import us.jubat.classifier.ClassifierClient import us.jubat.common.Datum import us.jubat.recommender.RecommenderClient -import us.jubat.yarn.client.{JubatusYarnApplicationStatus, JubatusYarnApplication, Resource} +import us.jubat.yarn.client.{JubatusYarnApplication, JubatusYarnApplicationStatus, Resource} import us.jubat.yarn.common.{LearningMachineType, Location} import scala.collection._ import scala.collection.convert.decorateAsScala._ -import scala.concurrent.{Future => ScFuture, Promise => ScPromise, Await => ScAwait, SyncVar} -import scala.concurrent.duration._ import 
scala.concurrent.ExecutionContext.Implicits.global +import scala.concurrent.duration._ +import scala.concurrent.{Await => ScAwait, Future => ScFuture, Promise => ScPromise, SyncVar} import scala.util.{Failure, Random, Success, Try} -import sun.misc.Signal -class JubaQLService(sc: SparkContext, runMode: RunMode) +class JubaQLService(sc: SparkContext, runMode: RunMode, checkpointDir: String) extends Service[HttpRequest, HttpResponse] with LazyLogging { val random = new Random() val parser = new JubaQLParser() // alias name for parser is needed to override SQLContext's parser - val parserAlias = parser - val sqlc = new SQLContext(sc) { - override val parser = parserAlias - } - val sources: concurrent.Map[String, (HybridProcessor, StructType)] = - new ConcurrentHashMap[String, (HybridProcessor, StructType)]().asScala + val sqlc = new JubaQLContext(sc, parser) + + sqlc.registerFunction("highestScoreLabel", (classes: List[Row]) => { + // actually we have a List[(String, Double)], but we get a List[Row] + if (classes.isEmpty) + "" + else { + classes.maxBy(_.getDouble(1)).getString(0) + } + }) + + val sources: concurrent.Map[String, (HybridProcessor, Option[StructType])] = + new ConcurrentHashMap[String, (HybridProcessor, Option[StructType])]().asScala val models: concurrent.Map[String, (JubatusYarnApplication, CreateModel, LearningMachineType)] = new ConcurrentHashMap[String, (JubatusYarnApplication, CreateModel, LearningMachineType)]().asScala - val startedJubatusInstances: concurrent.Map[String, ScFuture[JubatusYarnApplication]] = - new ConcurrentHashMap[String, ScFuture[JubatusYarnApplication]]().asScala + val startedJubatusInstances: concurrent.Map[String, (ScFuture[JubatusYarnApplication], CreateModel, LearningMachineType)] = + new ConcurrentHashMap[String, (ScFuture[JubatusYarnApplication], CreateModel, LearningMachineType)]().asScala + + // hold all statements received from a client, together with the data source name + // TODO replace this by a synchronized version? + val preparedStatements: mutable.Queue[(String, PreparedJubaQLStatement)] = new mutable.Queue() + + // hold names of all usable table-like objects, mapping to their main data source name + val knownStreamNames: concurrent.Map[String, String] = + new ConcurrentHashMap[String, String]().asScala + + // hold feature functions written in JavaScript. + val featureFunctions: concurrent.Map[String, String] = + new ConcurrentHashMap[String, String]().asScala + + val builtinFeatureFunctions = Set("id") + + val jubatusFeatureFunctions = Set("unigram", "bigram") + + // a feature function is invalid if it is not in one of the three possible sets + def invalidFeatureFunctions(ffs: List[String]): Set[String] = { + ffs.toSet. + diff(featureFunctions.keySet). + diff(builtinFeatureFunctions). 
+ diff(jubatusFeatureFunctions) + } // set this flag to `false` to prevent the HTTP server from processing queries protected val isAcceptingQueries: SyncVar[Boolean] = new SyncVar() isAcceptingQueries.put(true) - // set this flag to `true` to signal to executors they should stop processing - protected val executorsShouldFinishProcessing: SyncVar[Boolean] = new SyncVar() - executorsShouldFinishProcessing.put(false) + // set this value which will be communicated to executors via /status poll + protected val driverStatusMessage: SyncVar[String] = new SyncVar() + driverStatusMessage.put("running") // store a function to stop the UPDATE process (if one is running) protected var stopUpdateFunc: Option[() => (ProcessingInformation, ProcessingInformation)] = None @@ -93,10 +137,8 @@ class JubaQLService(sc: SparkContext, runMode: RunMode) case "/status" => val resp = new DefaultHttpResponse(HttpVersion.HTTP_1_1, HttpResponseStatus.OK) - if (executorsShouldFinishProcessing.get == true) - resp.setContent(ChannelBuffers.copiedBuffer("shutdown", CharsetUtil.UTF_8)) - else - resp.setContent(ChannelBuffers.copiedBuffer("running", CharsetUtil.UTF_8)) + resp.setContent(ChannelBuffers.copiedBuffer(driverStatusMessage.get, + CharsetUtil.UTF_8)) TwFuture.value(resp) // if we get POSTed a statement, process it @@ -106,29 +148,30 @@ class JubaQLService(sc: SparkContext, runMode: RunMode) // create an empty promise and create the processing pipeline val command = new TwPromise[String] - // TODO: use Either or Future semantics to transport success/failure information - val result: TwFuture[Option[String]] = command.map(parseJson).map(_.flatMap(takeAction)) + val parsedCommand: TwFuture[Either[(Int, String), JubaQLAST]] = + command.map(parseJson) + val actionResult: TwFuture[Either[(Int, String), JubaQLResponse]] = + parsedCommand.map(_.right.flatMap(takeAction)) // now actually put the received command in the promise, // triggering the processing command.setValue(body) // create an HttpResponse based on the result - val responseFuture = result.map(res => { - // pick HTTP response code and body + val responseFuture = actionResult.map(res => { + implicit val formats = DefaultFormats + // pick HTTP response code and render JSON body val (resp, bodyJson) = res match { - case Some(msg) => + case Left((httpStatusCode, errMsg)) => + // there was an error in some inner function + logger.warn("error during query processing: " + errMsg) (new DefaultHttpResponse(HttpVersion.HTTP_1_1, - HttpResponseStatus.OK), - // msg may already be a JSON string - // TODO: get this type-safe - if (msg.startsWith("{") || msg.startsWith("[")) - "{\"result\": %s}".format(msg) - else - "{\"result\": \"%s\"}".format(msg)) - case _ => + HttpResponseStatus.valueOf(httpStatusCode)), + Serialization.write(ErrorMessage(errMsg))) + case Right(result) => + // we got a result that we can render as JSON (new DefaultHttpResponse(HttpVersion.HTTP_1_1, - HttpResponseStatus.INTERNAL_SERVER_ERROR), - "{\"result\": \"error\"}") + HttpResponseStatus.OK), + Serialization.write(result)) } // add header and body resp.addHeader("Content-Type", "application/json; charset=utf-8") @@ -137,7 +180,6 @@ class JubaQLService(sc: SparkContext, runMode: RunMode) resp.getStatus.getCode)) resp }) - logger.debug("[%s] request processing prepared".format(requestId)) responseFuture // return 404 in any other case @@ -149,48 +191,112 @@ class JubaQLService(sc: SparkContext, runMode: RunMode) } } - protected def parseJson(in: String): Option[JubaQLAST] = { + protected def 
parseJson(in: String): Either[(Int, String), JubaQLAST] = { // parse string and extract the "query" field JsonMethods.parseOpt(in).map(_ \ "query") match { case Some(JString(queryString)) => try { - parser.parse(queryString) + parser.parse(queryString) match { + case None => + val msg = s"unable to parse queryString '$queryString'" + logger.error(msg) + Left((400, msg)) + case Some(result) => + Right(result) + } } catch { case e: Throwable => - logger.error(s"unable to parse queryString '$queryString': " + e.getMessage) - None + Left((400, s"unable to parse queryString '$queryString': " + e.getMessage)) } case Some(other) => - logger.warn(s"received JSON '$in' did not contain a query string") - None + val msg = s"received JSON '$in' did not contain a query string" + logger.warn(msg) + Left((400, msg)) + case None => + val msg = s"received string '$in' was not valid JSON" + logger.warn(msg) + Left((400, msg)) + } + } + + // takes a JSON-shaped string describing a Jubatus config and adds a + // default "converter" part if it is not present + protected def complementInputJson(inputJsonString: String): Either[(Int, String), JObject] = { + val defaultConverter = JObject( + "converter" -> JObject( + "num_filter_types" -> JObject(), + "num_filter_rules" -> JArray(Nil), + "string_filter_types" -> JObject(), + "string_filter_rules" -> JArray(Nil), + "num_types" -> JObject(), + "num_rules" -> JArray(JObject("key" -> "*", "type" -> "num") :: Nil), + // define two Jubatus-internal conversion methods + "string_types" -> JObject("unigram" -> JObject("method" -> "ngram", "char_num" -> "1"), + "bigram" -> JObject("method" -> "ngram", "char_num" -> "2")), + "string_rules" -> JArray( + // define rules how to recognize keys for internal conversion + JObject("key" -> "*-unigram-jubaconv", "type" -> "unigram", "sample_weight" -> "tf", "global_weight" -> "bin") :: + JObject("key" -> "*-bigram-jubaconv", "type" -> "bigram", "sample_weight" -> "tf", "global_weight" -> "bin") :: + JObject("key" -> "*", "except" -> "*-jubaconv", "type" -> "str", "sample_weight" -> "tf", "global_weight" -> "bin") :: Nil))) + + JsonMethods.parseOpt(inputJsonString) match { + case Some(obj: JObject) => + obj.values.get("converter") match { + case None => + // if the input has no converter, then append the default one + Right(obj ~ defaultConverter) + case _ => + // if the input *does* have a converter, use it as is + Right(obj) + } + + case Some(_) => + Left((400, "input config is not a JSON object.")) + case None => - logger.warn(s"received string '$in' was not valid JSON") - None + Left((400, "input config is not a JSON.")) } } - protected def takeAction(ast: JubaQLAST): Option[String] = { + protected def takeAction(ast: JubaQLAST): Either[(Int, String), JubaQLResponse] = { ast match { case anything if isAcceptingQueries.get == false => - logger.warn(s"received $anything while shutting down, not taking action") - // propagate message to client - None + val msg = s"received $anything while shutting down, not taking action" + logger.warn(msg) + Left((503, msg)) case cd: CreateDatasource => - val processor = new HybridProcessor(sc, sqlc, cd.sinkStorage, cd.sinkStreams) - // TODO schema must be optional - val schema = StructType(cd.columns.map { - case (colName, dataType) => { - StructField(colName, dataType.toLowerCase match { - case "numeric" => LongType - case "string" => StringType - case "boolean" => BooleanType - case _ => ??? 
- }, false) + if (knownStreamNames.contains(cd.sourceName)) { + val msg = "data source '%s' already exists".format(cd.sourceName) + logger.warn(msg) + Left((400, msg)) + } else { + val processor = new HybridProcessor(sc, sqlc, + cd.sinkStorage, cd.sinkStreams, + runMode, + checkpointDir) + val maybeSchema = cd.columns match { + case Nil => + None + case cols => + Some(StructType(cols.map { + case (colName, dataType) => { + StructField(colName, dataType.toLowerCase match { + case "numeric" => DoubleType + case "string" => StringType + case "boolean" => BooleanType + case _ => ??? + }, nullable = false) + } + })) } - }) - sources.put(cd.sourceName, (processor, schema)) - Some("CREATE DATASOURCE") + // register this datasource internally so subsequent statements + // can look it up + sources.put(cd.sourceName, (processor, maybeSchema)) + // data sources "point" to themselves + knownStreamNames += ((cd.sourceName, cd.sourceName)) + Right(StatementProcessed("CREATE DATASOURCE")) + } case cm: CreateModel => val jubaType: LearningMachineType = cm.algorithm match { @@ -201,6 +307,22 @@ class JubaQLService(sc: SparkContext, runMode: RunMode) case "RECOMMENDER" => LearningMachineType.Recommender } + + // check if all feature functions exist + val badFFs = invalidFeatureFunctions(cm.featureExtraction.map(_._2)) + if (!badFFs.isEmpty) { + val msg = "unknown feature functions: " + badFFs.mkString(", ") + logger.warn(msg) + return Left((400, msg)) + } + + val configJsonStr: String = complementInputJson(cm.configJson) match { + case Left((errCode, errMsg)) => + return Left((errCode, errMsg)) + case Right(config) => + import JsonMethods._ + compact(render(config)) + } // TODO: location, resource val resource = Resource(priority = 0, memory = 256, virtualCores = 1) val juba: ScFuture[JubatusYarnApplication] = runMode match { @@ -208,118 +330,535 @@ class JubaQLService(sc: SparkContext, runMode: RunMode) val location = zookeeper.map { case (host, port) => Location(InetAddress.getByName(host), port) } - JubatusYarnApplication.start(cm.modelName, jubaType, location, cm.configJson, resource, 2) + JubatusYarnApplication.start(cm.modelName, jubaType, location, configJsonStr, resource, 2) case RunMode.Development => - LocalJubatusApplication.start(cm.modelName, jubaType, cm.configJson) + LocalJubatusApplication.start(cm.modelName, jubaType, configJsonStr) } // we keep a reference to the started instance so we can always check its status // and wait for it to come up if necessary val startedInstance = ScPromise[JubatusYarnApplication]() - startedJubatusInstances.put(cm.modelName, startedInstance.future) + startedJubatusInstances.put(cm.modelName, (startedInstance.future, cm, jubaType)) juba onComplete { case Success(j) => logger.info("CREATE MODEL succeeded") models.put(cm.modelName, (j, cm, jubaType)) startedInstance.completeWith(juba) case Failure(t) => - logger.info("CREATE MODEL failed") + logger.warn("CREATE MODEL failed: " + t.getMessage) t.printStackTrace() startedInstance.completeWith(juba) } - Some("CREATE MODEL (started)") + Right(StatementProcessed("CREATE MODEL (started)")) - case update: Update => - var model: JubatusYarnApplication = null - var jubaType: LearningMachineType = null - var cm: CreateModel = null - // wait until model is available (when Jubatus is started) or timeout - startedJubatusInstances.get(update.modelName).foreach(jubaFut => { - if (!jubaFut.isCompleted) { - logger.debug("waiting for model %s to come up".format(update.modelName)) - ScAwait.ready(jubaFut, 1 minute) - } - }) - val 
maybeModel = models.get(update.modelName) - maybeModel match { - case Some((s, c, ty)) => (s, c, ty) - model = s - cm = c - jubaType = ty - case None => - // TODO: error message - logger.error("model not found") - return None + case CreateStreamFromSelect(streamName, selectPlan) => + if (knownStreamNames.contains(streamName)) { + val msg = s"stream '$streamName' already exists" + logger.warn(msg) + Left((400, msg)) + } else { + val refStreams = selectPlan.children.flatMap(collectAllChildren) + withStreams(refStreams)(mainDataSource => { + // register this stream internally + knownStreamNames += ((streamName, mainDataSource)) + preparedStatements.enqueue((mainDataSource, PreparedCreateStreamFromSelect(streamName, + selectPlan, refStreams.toList))) + Right(StatementProcessed("CREATE STREAM")) + }) } - // Note: Theoretically it would as well be possible to address the jubatus - // instances directly by looking at `model.jubatusServers`. - val jubaHost = model.jubatusProxy.hostAddress - val jubaPort = model.jubatusProxy.port - val trainSpecifier = cm.specifier.toMap - val keys = trainSpecifier.get("datum") match { - case Some(list) if list.nonEmpty => list - case _ => ??? // TODO: throw exception. datum not specified + case CreateStreamFromSlidingWindow(streamName, windowSize, slideInterval, + windowType, source, funcSpecs, postCond) => + // pick the correct aggregate functions for the given aggregate list + val checkedFuncSpecs = funcSpecs.map { + case (funcName, params, alias) => + val maybeAggFun: Either[String, (SomeAggregateFunction[_])] = try { + funcName.toLowerCase match { + case "avg" => + AggregateFunctions.checkAvgParams(params) + case "stddev" => + AggregateFunctions.checkStdDevParams(params) + case "quantile" => + AggregateFunctions.checkQuantileParams(params) + case "linapprox" => + AggregateFunctions.checkLinApproxParams(params) + case "fourier" => + AggregateFunctions.checkFourierParams(params) + case "wavelet" => + AggregateFunctions.checkWaveletParams(params) + case "histogram" => + AggregateFunctions.checkHistogramParams(params) + case "concat" => + AggregateFunctions.checkConcatParams(params) + case "maxelem" => + AggregateFunctions.checkMaxElemParams(params) + case other => + Left("unknown aggregation function: " + other) + } + } catch { + case e: Throwable => + Left("error while checking " + funcName + ": " + + e.getMessage) + } + maybeAggFun match { + case Left(msg) => + Left(msg) + case Right(aggFun) => + Right((funcName, aggFun, alias)) + } + } + // check if we have any errors in the aggregate list + val errors = checkedFuncSpecs.collect { + case Left(msg) => msg } + if (errors.size > 0) { + val msg = "invalid parameter specification: " + errors.mkString(", ") + logger.warn(msg) + Left((400, msg)) + } else if (knownStreamNames.contains(streamName)) { + val msg = s"stream '$streamName' already exists" + logger.warn(msg) + Left((400, msg)) + } else { + val refStreams = source.children.flatMap(collectAllChildren) - val updater = jubaType match { - case LearningMachineType.Anomaly if update.rpcName == "add" => - new Anomaly(jubaHost, jubaPort, cm, keys) + withStreams(refStreams)(mainDataSource => { + // register this stream internally + knownStreamNames += ((streamName, mainDataSource)) + val flattenedFuncs = checkedFuncSpecs.collect{ case Right(x) => x } + // build the schema that will result from this statement + // (add one additional column with the window timestamp if the + // window is timestamp-based) + val typeInfo = flattenedFuncs.map(c => (c._1, c._2.outType, 
c._3)) + val schemaHead = if (windowType == "time") + StructField("jubaql_timestamp", StringType, nullable = false) :: Nil + else + Nil + val schema = StructType(schemaHead ++ typeInfo.zipWithIndex.map{ + case ((funcName, dataType, maybeAlias), idx) => + // if there was an AS given in the statement, fine. if not, + // use the function name (or function name + dollar + index + // if the same function is used multiple times). + val alias = maybeAlias.getOrElse({ + if (typeInfo.filter(f => f._3.isEmpty && f._1 == funcName).size > 1) + funcName + "$" + idx + else + funcName + }) + StructField(alias, dataType, nullable = false) + }) + // at this point, the `source` already has the pre-condition applied + // and the correct columns selected. however, we still need to add the + // right casts to Double/String. + val headColumns = if (windowType == "time") + Alias(Cast(UnresolvedAttribute("jubaql_timestamp"), StringType), + "key")() :: Nil + else + Nil + val projectedSource = source.asInstanceOf[Project] + val sourceWithCast = Project(headColumns ++ + projectedSource.projectList.zip(flattenedFuncs).map{ + case (a: Alias, funcDesc) => + Alias(Cast(a.child, funcDesc._2.inType), a.name)() + case (other, funcDesc) => + Alias(Cast(other, funcDesc._2.inType), other.name)() + }, projectedSource.child) + val functionObjects = flattenedFuncs.map(_._2) + preparedStatements.enqueue((mainDataSource, PreparedCreateStreamFromSlidingWindow(streamName, + windowSize, slideInterval, windowType, sourceWithCast, functionObjects, + schema, postCond))) + Right(StatementProcessed("CREATE STREAM")) + }) + } - case LearningMachineType.Classifier if update.rpcName == "train" => - val label = trainSpecifier.get("label") match { - case Some(la :: Nil) => la - case _ => ??? // TODO: throw exception + case cs: CreateStreamFromAnalyze => + val validCombination: (LearningMachineType, String) => Boolean = { + case (LearningMachineType.Anomaly, "calc_score") => true + case (LearningMachineType.Classifier, "classify") => true + case (LearningMachineType.Recommender, "complete_row_from_id") => true + case (LearningMachineType.Recommender, "complete_row_from_datum") => true + case _ => false + } + if (knownStreamNames.contains(cs.streamName)) { + val msg = "stream '%s' already exists".format(cs.streamName) + logger.warn(msg) + Left((400, msg)) + } else { + withStream(cs.analyze.data)(mainDataSource => { + prepareJubaClient(cs.analyze.modelName, cs.analyze.data, cs.analyze.rpcName, + validCombination) match { + case Right((modelFut, analyzerFut)) => + // register this stream internally + knownStreamNames += ((cs.streamName, mainDataSource)) + // put the UPDATE statement in the statement queue + preparedStatements.enqueue((mainDataSource, PreparedCreateStreamFromAnalyze(cs.streamName, + cs.analyze.modelName, modelFut, + cs.analyze.data, analyzerFut, cs.analyze.rpcName, + cs.newColumn))) + Right(StatementProcessed("CREATE STREAM")) + case Left((code, msg)) => + Left((code, msg)) } - new Classifier(jubaHost, jubaPort, cm, keys) + }) + } + + case CreateTrigger(dsName, condition, function) => + function match { + case f: UnresolvedFunction => + JavaScriptUDFManager.getNumberOfArgsByFunctionName(f.name) match { + case None => + val msg = s"no user-defined function named ${f.name}" + logger.error(msg) + return Left((400, msg)) - case LearningMachineType.Recommender if update.rpcName == "update_row" => - val id = trainSpecifier.get("id") match { - case Some(id :: Nil) => id - case _ => ??? 
// TODO: throw exception + case Some(nargs) if nargs != f.children.size => + val msg = s"number of arguments is mismatched (${f.name} takes $nargs arguments, but ${f.children.size} were given)" + logger.error(msg) + return Left((400, msg)) + + case _ => + // do nothing } - new Recommender(jubaHost, jubaPort, cm, id, keys) - - case lmt => - logger.error("no matching learning machine for " + lmt) - return None - } - - // Start to process RDD - try sources.get(update.source) match { - case Some((rddProcessor, schema)) => - logger.info("UPDATE started") - val (host, port) = JubaQLProcessor.getListeningAddress - val statusUrl = "http://%s:%s/status".format(host.getHostAddress, port) - val stopFun = rddProcessor.start(rddjson => { - rddjson.mapPartitions(updater(_, statusUrl)) - })._1 - // store the function to stop processing - stopUpdateFunc = Some(() => stopFun()) - Some("UPDATE MODEL") + case _ => + val msg = "EXECUTE expects a user-defined function, not a Spark SQL builtin function" + logger.error(msg) + return Left((400, msg)) + } + withStream(dsName)(mainDataSource => { + preparedStatements.enqueue((mainDataSource, + PreparedCreateTrigger(dsName, condition, function))) + Right(StatementProcessed("CREATE TRIGGER")) + }) + + case LogStream(streamName) => + withStream(streamName)(mainDataSource => { + preparedStatements.enqueue((mainDataSource, PreparedLogStream(streamName))) + Right(StatementProcessed("LOG STREAM")) + }) + case update: Update => + val validCombination: (LearningMachineType, String) => Boolean = { + case (LearningMachineType.Anomaly, "add") => true + case (LearningMachineType.Classifier, "train") => true + case (LearningMachineType.Recommender, "update_row") => true + case _ => false + } + withStream(update.source)(mainDataSource => { + prepareJubaClient(update.modelName, update.source, update.rpcName, + validCombination) match { + case Right((modelFut, updaterFut)) => + // put the UPDATE statement in the statement queue + preparedStatements.enqueue((mainDataSource, PreparedUpdate(update.modelName, modelFut, + update.source, updaterFut))) + Right(StatementProcessed("UPDATE MODEL")) + case Left((code, msg)) => + Left((code, msg)) + } + }) + + case StartProcessing(sourceName) => + sources.get(sourceName) match { case None => - // TODO: error message - logger.error("source '%s' not found".format(update.source)) - None + val msg = "unknown data source: " + sourceName + logger.warn(msg) + Left((400, msg)) + case Some((processor, _)) if processor.state != Initialized => + val msg = "cannot start processing a data source in state " + processor.state + logger.warn(msg) + Left((400, msg)) + case Some((processor, _)) if sources.values.exists(_._1.state == Running) => + val msg = "there is already a running process, try to run STOP PROCESSING first" + logger.warn(msg) + Left((400, msg)) + case Some((processor, maybeSchema)) => + logger.info(s"setting up processing pipeline for data source '$sourceName' " + + s"with given schema $maybeSchema") + + val rddOperations: mutable.Queue[Either[(Int, String), StreamingContext => Unit]] = + preparedStatements.filter(_._1 == sourceName).map(_._2).map(stmt => { + logger.debug(s"dealing with $stmt") + stmt match { + // CREATE STREAM ... FROM SELECT ... + // => execute a select and register the result as a table + case PreparedCreateStreamFromSelect(streamName, selectPlan, _) => + logger.info(s"adding 'CREATE STREAM $streamName FROM SELECT ...'
to pipeline") + Right((ssc: StreamingContext) => { + logger.debug(s"executing 'CREATE STREAM $streamName FROM SELECT ...'") + SchemaDStream.fromSQL(ssc, sqlc, + selectPlan, Some(streamName)) + () + }) + + // CREATE STREAM ... FROM SLIDING WINDOW ... + case PreparedCreateStreamFromSlidingWindow(streamName, windowSize, + slideInterval, windowType, source, funcSpecs, outSchema, maybePostCond) => + logger.info(s"adding 'CREATE STREAM $streamName FROM SLIDING WINDOW ...' to pipeline") + val fun = (ssc: StreamingContext) => { + logger.debug(s"executing 'CREATE STREAM $streamName FROM SLIDING WINDOW ...'") + // NB. the precondition is already applied in the `source` + val inputStream = SchemaDStream.fromSQL(ssc, sqlc, source, None) + val rowStream = inputStream.dataStream + val schemaStream = inputStream.schemaStream + + // compute window stream + val windowStream = if (windowType == "tuples") { + SlidingWindow.byCount(rowStream, windowSize, slideInterval) + } else { + // the first column is the timestamp by construction + val keyedRowStream = rowStream.map(row => { + (Helpers.parseTimestamp(row.getString(0)), + Row(row.tail: _*)) + }) + // compute window stream + SlidingWindow.byTimestamp(keyedRowStream, + windowSize, slideInterval) + } + + // if we access the window stream more than once, cache it + if (funcSpecs.size > 1) { + windowStream.persist(StorageLevel.MEMORY_AND_DISK_SER) + } + // apply the i-th aggregate function on the i-th element + // of the selected row + val aggregatedStreams = funcSpecs.zipWithIndex.map{ + case (f: DoubleInputAggFun, idx) => + val doubleStream = windowStream.mapValues(rowWithKey => + (rowWithKey._1, rowWithKey._2.getDouble(idx))) + doubleStream.transform(f.aggFun _) + case (f: StringInputAggFun, idx) => + val stringStream = windowStream.mapValues(rowWithKey => + (rowWithKey._1, rowWithKey._2.getString(idx))) + stringStream.transform(f.aggFun _) + } + // merge the aggregated columns together + val firstStream = aggregatedStreams.head.mapValues(_ :: Nil) + val combinedStream = aggregatedStreams.tail + .foldLeft(firstStream)((left, right) => { + left.join(right).mapValues(lr => lr._1 :+ lr._2) + }).transform(_.sortByKey()) + // convert to Rows and add schema + val outRowStream = if (windowType == "tuples") { + combinedStream.map(keyVal => Row(keyVal._2 :_*)) + } else { + combinedStream.map(keyVal => { + val data = Helpers.formatTimestamp(keyVal._1) :: keyVal._2 + Row(data :_*) + }) + } + val outSchemaCopy = outSchema // outSchema is not serializable + val outSchemaStream = schemaStream.map(_ => outSchemaCopy) + // apply the post condition ("HAVING") if present + val filteredOutRowStream = maybePostCond.map(postCond => { + outRowStream.transform(rdd => { + val schemaRdd = sqlc.applySchema(rdd, outSchemaCopy) + schemaRdd.where(postCond) + }) + }).getOrElse(outRowStream) + SchemaDStream(sqlc, filteredOutRowStream, outSchemaStream) + .registerStreamAsTable(streamName) + () + } + Right(fun) + + // CREATE STREAM ... FROM ANALYZE ... 
+ // => run updater.analyze on each partition + case PreparedCreateStreamFromAnalyze(streamName, modelName, + modelFut, dataSourceName, analyzerFut, rpcName, newColumn) => + // wait until model is available (when Jubatus is started) or timeout + if (!modelFut.isCompleted) { + logger.debug("waiting for model %s to come up".format(modelName)) + } else { + logger.debug("model %s is already up".format(modelName)) + } + val maybeModel = Try(ScAwait.result(modelFut, 1.minute)) + maybeModel match { + case Failure(t) => + val msg = "model %s failed to start up: %s".format( + modelName, t.getMessage) + logger.error(msg) + Left((500, msg)) + + case Success(juba) => + // wait until updater is ready or timeout + Try(ScAwait.result(analyzerFut, 1.minute)) match { + case Failure(t) => + val msg = "cannot use model %s: %s".format( + modelName, t.getMessage) + logger.error(msg) + Left((500, msg)) + + case Success(updater) => + val (host, port) = JubaQLProcessor.getListeningAddress + val statusUrl = "http://%s:%s/status".format(host.getHostAddress, port) + + logger.info(s"adding 'CREATE STREAM $streamName FROM ANALYZE ...' to pipeline") + Right((ssc: StreamingContext) => { + logger.debug(s"executing 'CREATE STREAM $streamName FROM ANALYZE ...'") + SchemaDStream.fromRDDTransformation(ssc, sqlc, dataSourceName, tmpRdd => { + val rddSchema: StructType = tmpRdd.schema + val analyzeFun = UpdaterAnalyzeWrapper(rddSchema, statusUrl, + updater, rpcName) + val newSchema = StructType(rddSchema.fields :+ + StructField(newColumn.getOrElse(rpcName), + analyzeFun.dataType, nullable = false)) + val newRdd = sqlc.applySchema(tmpRdd.mapPartitionsWithIndex((idx, iter) => { + val formatter = new SimpleDateFormat("HH:mm:ss.SSS") + val hostname = InetAddress.getLocalHost().getHostName() + println("%s @ %s [%s] DEBUG analyzing model from partition %d".format( + formatter.format(new Date), hostname, Thread.currentThread().getName, idx + )) + iter + }).mapPartitions(analyzeFun.apply(_)), + newSchema) + newRdd + }, Some(streamName)) + () + }) + } + } + + case PreparedCreateTrigger(dsName, condition, expr) => + logger.info(s"adding 'CREATE TRIGGER $dsName' to pipeline") + Right((ssc: StreamingContext) => { + logger.debug(s"executing 'CREATE TRIGGER $dsName'") + SchemaDStream.fromTableName(ssc, sqlc, dsName).foreachRDD(rdd => { + val rddWithCondition = condition match { + case None => + rdd + case Some(c) => + rdd.where(c) + } + rddWithCondition.select(expr).collect() // count() does not work here. + () + }) + }) + + case PreparedLogStream(streamName) => + logger.info(s"adding 'LOG STREAM $streamName' to pipeline") + Right((ssc: StreamingContext) => { + SchemaDStream.fromTableName(ssc, sqlc, streamName).foreachRDD(rdd => { + logger.debug(s"executing 'LOG STREAM $streamName'") + val dataToPrint = rdd.take(101) + val hasMoreData = dataToPrint.size == 101 + val ellipsis = + if (hasMoreData) "\n( ... more items ...)" + else "" + println("STREAM: " + streamName + "\n" + + rdd.schema.fields.map(sf => + sf.name + " " + sf.dataType).mkString(" | ") + "\n" + + dataToPrint.take(100).map(row => row.mkString(" | ")).mkString("\n") + + ellipsis + ) + }) + () + }) + + // UPDATE MODEL ... USING ... 
+ // => run updater.apply on each partition + case PreparedUpdate(modelName, modelFut, dataSourceName, updaterFut) => + // wait until model is available (when Jubatus is started) or timeout + if (!modelFut.isCompleted) { + logger.debug("waiting for model %s to come up".format(modelName)) + } else { + logger.debug("model %s is already up".format(modelName)) + } + val maybeModel = Try(ScAwait.result(modelFut, 1.minute)) + maybeModel match { + case Failure(t) => + val msg = "model %s failed to start up: %s".format( + modelName, t.getMessage) + logger.error(msg) + Left((500, msg)) + + case Success(juba) => + // wait until updater is ready or timeout + Try(ScAwait.result(updaterFut, 1.minute)) match { + case Failure(t) => + val msg = "cannot update model %s: %s".format( + modelName, t.getMessage) + logger.error(msg) + Left((500, msg)) + + case Success(updater) => + val (host, port) = JubaQLProcessor.getListeningAddress + val statusUrl = "http://%s:%s/status".format(host.getHostAddress, port) + + logger.info(s"adding 'UPDATE MODEL $modelName ...' to pipeline") + Right((ssc: StreamingContext) => { + SchemaDStream.fromTableName(ssc, sqlc, dataSourceName).foreachRDD(tmpRdd => { + logger.debug(s"executing 'UPDATE MODEL $modelName ...'") + val rddSchema: StructType = tmpRdd.schema + val updateFun = UpdaterApplyWrapper(rddSchema, statusUrl, updater) + // NOTE: you can add sample(...) here to work only on a subset of the items + tmpRdd.mapPartitionsWithIndex((idx, iter) => { + val formatter = new SimpleDateFormat("HH:mm:ss.SSS") + val hostname = InetAddress.getLocalHost().getHostName() + println("%s @ %s [%s] DEBUG updating model with partition %d".format( + formatter.format(new Date), hostname, Thread.currentThread().getName, idx + )) + iter + }).foreachPartition(updateFun.apply) + }) + }) + } + } + + // unknown statement type + case _ => + ??? + } + }) + logger.info("pipeline setup complete (%d items)".format(rddOperations.size)) + + rddOperations.collectFirst{ case Left(errDesc) => errDesc } match { + // there was an error during pipeline setup + case Some((code, msg)) => + Left((code, msg)) + + // there was no error, but also no instructions + case None if rddOperations.isEmpty => + val msg = "there are no processing instructions" + logger.warn(msg) + Left((400, msg)) + + // there was no error + case None => + def transform: SchemaDStream => Unit = inputStream => { + inputStream.registerStreamAsTable(sourceName) + val context = inputStream.dataStream.context + rddOperations.collect{ case Right(fun) => fun }.foreach(_.apply(context)) + } + logger.info("starting HybridProcessor with created pipeline") + val stopFun = processor.startTableProcessingGeneral(transform, + maybeSchema, sourceName)._1 + stopUpdateFunc = Some(() => stopFun()) + Right(StatementProcessed("START PROCESSING")) + } } case ana: Analyze => queryAnalyze(ana) match { - case Some(toReturn) => - Some(toReturn) - case None => - logger.error("no ANALYZE result for " + ana) - None + case Left(msgWithErrCode) => + Left(msgWithErrCode) + case Right(anaResult) => + Right(AnalyzeResultWrapper(anaResult)) } + case s: Status => + val dsStatus = sources.mapValues(_._1.state.toString) + val jubaStatus = models.mapValues(_._1 match { + case dummy: LocalJubatusApplication => "OK" + case real => real.status.toString + }) + Right(StatusResponse("STATUS", dsStatus.toMap, jubaStatus.toMap)) + case s: Shutdown => // first set a flag to stop further query processing isAcceptingQueries.set(false) // NB. 
put() has different semantics - // stop stream processing - val procStats = stopUpdateFunc match { + // stop stream processing + val procStats = stopUpdateFunc match { case Some(func) => - Some(stopStreamProcessing(func)) + Some(stopStreamProcessing(func, forShutdown = true)) case _ => logger.info("apparently there was no stream processing running") None @@ -328,7 +867,7 @@ class JubaQLService(sc: SparkContext, runMode: RunMode) // ever started, independent of complete (successful or failed) or still // starting: val stoppedJubaFut: Iterable[ScFuture[Unit]] = startedJubatusInstances.map { - case (modelName, jubaFut) => + case (modelName, (jubaFut, _, _)) => logger.debug(s"scheduling shutdown for model $modelName") // If the startup failed, no need to shutdown. For all non-failed // instances (still starting or started successfully), we schedule @@ -337,52 +876,505 @@ class JubaQLService(sc: SparkContext, runMode: RunMode) } // now convert a list of futures into a future of list and wait until completion logger.info("waiting for all Jubatus instances to shut down") - ScAwait.ready(ScFuture.sequence(stoppedJubaFut), 1 minute) + ScAwait.ready(ScFuture.sequence(stoppedJubaFut), 1.minute) // send a KILL signal to us to trigger Spark and Finagle shutdown Signal.raise(new Signal("TERM")) procStats match { case Some((staticInfo, streamInfo)) => - Some("SHUTDOWN (processing time: %s ms/%s ms)".format( - staticInfo.runtime, streamInfo.runtime)) + Right(StatementProcessed("SHUTDOWN (processing time: %s ms/%s ms)".format( + staticInfo.runtime, streamInfo.runtime))) case _ => - Some("SHUTDOWN") + Right(StatementProcessed("SHUTDOWN")) } case sp: StopProcessing => stopUpdateFunc match { case Some(func) => - val (staticInfo, streamInfo) = stopStreamProcessing(func) + val (staticInfo, streamInfo) = stopStreamProcessing(func, forShutdown = false) stopUpdateFunc = None - Some("STOP PROCESSING (processing time: %s ms/%s ms)".format( - staticInfo.runtime, streamInfo.runtime)) + Right(StatementProcessed("STOP PROCESSING (processing time: %s ms/%s ms)".format( + staticInfo.runtime, streamInfo.runtime))) case _ => - logger.warn("apparently there was no stream processing running") - None + val msg = "apparently there was no stream processing running" + logger.warn(msg) + Left((400, msg)) + } + + case CreateFunction(funcName, args, returnType, lang, body) => + // TODO: write log + // TODO: pass all args + if (!lang.equalsIgnoreCase("JavaScript")) + return Left((400, "only JavaScript is supported")) + if (args.isEmpty) + return Left((400, "args should contain at least one element")) + + val argString = args.map(_._1).mkString(", ") + val funcBody = s"function $funcName($argString) { $body }" + // try to find bugs in the syntax early + try { + JavaScriptUDFManager.register(funcName, args.size, funcBody) + } catch { + case e: Throwable => + // TODO: better message + return Left((400, e.getMessage)) + } + + val validTypes = "numeric" :: "string" :: "boolean" :: Nil + args.length match { + case n if n <= 0 => + Left((400, "number of arguments must be more than zero.")) + + case _ if !validTypes.contains(returnType) => + Left((400, "bad return type")) + + // def nArgsString(nArgs: Int): String = + // (0 until nArgs).map(n => s"x$n").mkString(", ") + // + // def nParamsString(nParams: Int): String = { + // (0 until nParams).map(n => s"x$n: AnyRef").mkString(", ") + // } + // + // def caseTypeString(sqlType: String, scalaType: String, defaultValue: String, nArgs: Int): String = { + // val args = nArgsString(nArgs) + // val 
params = nParamsString(nArgs) + // s"""case "$sqlType" => + // | sqlc.registerFunction(funcName, ($params) => { + // | JavaScriptUDFManager.registerAndCall[$scalaType](funcName, + // | $nArgs, funcBody, $args).getOrElse($defaultValue) + // | })""".stripMargin + // } + // + // def caseNArgs(nArgs: Int): String = { + // val numericCase = caseTypeString("numeric", "Double", "0.0", nArgs).split("\n").map(" " + _).mkString("\n") + // val stringCase = caseTypeString("string", "String", "\"\"", nArgs).split("\n").map(" " + _).mkString("\n") + // val booleanCase = caseTypeString("boolean", "Boolean", "false", nArgs).split("\n").map(" " + _).mkString("\n") + // s"""case $nArgs => + // | returnType match { + // |$numericCase + // |$stringCase + // |$booleanCase + // | } + // | Right(StatementProcessed("CREATE FUNCTION")) + // |""".stripMargin + // } + // + // following cases are generated with the above script. + case 1 => + returnType match { + case "numeric" => + sqlc.registerFunction(funcName, (x0: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Double](funcName, + 1, funcBody, x0).getOrElse(0.0) + }) + case "string" => + sqlc.registerFunction(funcName, (x0: AnyRef) => { + JavaScriptUDFManager.registerAndCall[String](funcName, + 1, funcBody, x0).getOrElse("") + }) + case "boolean" => + sqlc.registerFunction(funcName, (x0: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Boolean](funcName, + 1, funcBody, x0).getOrElse(false) + }) + } + Right(StatementProcessed("CREATE FUNCTION")) + + case 2 => + returnType match { + case "numeric" => + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Double](funcName, + 2, funcBody, x0, x1).getOrElse(0.0) + }) + case "string" => + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef) => { + JavaScriptUDFManager.registerAndCall[String](funcName, + 2, funcBody, x0, x1).getOrElse("") + }) + case "boolean" => + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Boolean](funcName, + 2, funcBody, x0, x1).getOrElse(false) + }) + } + Right(StatementProcessed("CREATE FUNCTION")) + + case 3 => + returnType match { + case "numeric" => + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef, x2: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Double](funcName, + 3, funcBody, x0, x1, x2).getOrElse(0.0) + }) + case "string" => + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef, x2: AnyRef) => { + JavaScriptUDFManager.registerAndCall[String](funcName, + 3, funcBody, x0, x1, x2).getOrElse("") + }) + case "boolean" => + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef, x2: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Boolean](funcName, + 3, funcBody, x0, x1, x2).getOrElse(false) + }) + } + Right(StatementProcessed("CREATE FUNCTION")) + + case 4 => + returnType match { + case "numeric" => + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef, x2: AnyRef, x3: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Double](funcName, + 4, funcBody, x0, x1, x2, x3).getOrElse(0.0) + }) + case "string" => + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef, x2: AnyRef, x3: AnyRef) => { + JavaScriptUDFManager.registerAndCall[String](funcName, + 4, funcBody, x0, x1, x2, x3).getOrElse("") + }) + case "boolean" => + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef, x2: AnyRef, x3: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Boolean](funcName, + 4, funcBody, x0, x1, x2, x3).getOrElse(false) + }) + } + Right(StatementProcessed("CREATE 
FUNCTION")) + + case 5 => + returnType match { + case "numeric" => + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef, x2: AnyRef, x3: AnyRef, x4: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Double](funcName, + 5, funcBody, x0, x1, x2, x3, x4).getOrElse(0.0) + }) + case "string" => + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef, x2: AnyRef, x3: AnyRef, x4: AnyRef) => { + JavaScriptUDFManager.registerAndCall[String](funcName, + 5, funcBody, x0, x1, x2, x3, x4).getOrElse("") + }) + case "boolean" => + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef, x2: AnyRef, x3: AnyRef, x4: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Boolean](funcName, + 5, funcBody, x0, x1, x2, x3, x4).getOrElse(false) + }) + } + Right(StatementProcessed("CREATE FUNCTION")) + + case _ => + Left((400, "too many arguments")) + } + + case CreateFeatureFunction(funcName, args, lang, body) => + if (!lang.equalsIgnoreCase("JavaScript")) { + val msg = s"language $lang is not supported" + logger.warn(msg) + return Left((400, msg)) + } + if (args.isEmpty) { + val msg = s"a function shall have at least one element" + logger.warn(msg) + return Left((400, msg)) + } + + val argString = args.map(_._1).mkString(", ") + val funcBody = s"function $funcName($argString) { $body }" + // try to find bugs in the syntax early + try { + JavaScriptFeatureFunctionManager.register(funcName, args.size, funcBody) + } catch { + case e: Throwable => + val msg = f"the function has syntax error: ${e.getMessage}" + logger.warn(msg) + return Left((400, msg)) + } + + featureFunctions += (funcName -> funcBody) + Right(StatementProcessed("CREATE FEATURE FUNCTION")) + + case CreateTriggerFunction(funcName, args, lang, body) => + // TODO: write log + // TODO: pass all args + if (!lang.equalsIgnoreCase("JavaScript")) { + val msg = s"language $lang is not supported" + logger.warn(msg) + return Left((400, msg)) + } + if (args.isEmpty) { + val msg = s"a function shall have at least one element" + logger.warn(msg) + return Left((400, msg)) + } + + val argString = args.map(_._1).mkString(", ") + val funcBody = s"function $funcName($argString) { $body }" + // try to find bugs in the syntax early + try { + JavaScriptUDFManager.register(funcName, args.size, funcBody) + } catch { + case e: Throwable => + val msg = f"the function has syntax error: ${e.getMessage}" + logger.warn(msg) + return Left((400, msg)) + } + + args.length match { + case 1 => + // Returns an Int value because registerFunction does not accept a function which returns Unit. + // The Int value is not used. + sqlc.registerFunction(funcName, (x0: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Int](funcName, + 1, funcBody, x0).getOrElse(0) + }) + Right(StatementProcessed("CREATE TRIGGER FUNCTION")) + + case 2 => + // Returns Int for the above reason. + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Int](funcName, + 2, funcBody, x0, x1).getOrElse(0) + }) + Right(StatementProcessed("CREATE TRIGGER FUNCTION")) + + case 3 => + // Returns Int for the above reason. + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef, x2: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Int](funcName, + 3, funcBody, x0, x1, x2).getOrElse(0) + }) + Right(StatementProcessed("CREATE TRIGGER FUNCTION")) + + case 4 => + // Returns Int for the above reason. 
+ sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef, x2: AnyRef, x3: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Int](funcName, + 4, funcBody, x0, x1, x2, x3).getOrElse(0) + }) + Right(StatementProcessed("CREATE TRIGGER FUNCTION")) + + case 5 => + // Returns Int for the above reason. + sqlc.registerFunction(funcName, (x0: AnyRef, x1: AnyRef, x2: AnyRef, x3: AnyRef, x4: AnyRef) => { + JavaScriptUDFManager.registerAndCall[Int](funcName, + 5, funcBody, x0, x1, x2, x3, x4).getOrElse(0) + }) + Right(StatementProcessed("CREATE TRIGGER FUNCTION")) + + case _ => + val msg = "too many arguments" + logger.warn(msg) + Left((400, msg)) } case other => - logger.error("no handler for " + other) - None + val msg = "no handler for " + other + logger.error(msg) + Left((500, msg)) + } + } + + // collect all tables referenced in a statement + protected def collectAllChildren(plan: LogicalPlan): Seq[String] = plan match { + case un: UnaryNode => + collectAllChildren(un.child) + case bn: BinaryNode => + bn.children.flatMap(collectAllChildren) + case UnresolvedRelation(tableIdentifier, _) => + tableIdentifier + case other => + Nil + } + + protected def prepareJubaClient(modelName: String, sourceName: String, rpcName: String, + validCombination: (LearningMachineType, String) => Boolean): + Either[(Int, String), (ScFuture[JubatusYarnApplication], ScFuture[JubatusClient])] = { + // check if the specified model exists (or at least, was started) + startedJubatusInstances.get(modelName) match { + // no such model was defined before + case None => + val msg = "no model called '%s'".format(modelName) + logger.info(msg) + Left((400, msg)) + + // a model was defined before + case Some((jubaFut, cm, jubaType)) => + jubaFut.value match { + // complete, but with failure + case Some(Failure(t)) => + val msg = "model %s failed to start up".format(modelName) + logger.error(msg) + Left((500, msg)) + + // not yet complete (but started) or succeeded + case _ => + // check if the specified stream exists + if (knownStreamNames.contains(sourceName)) { + // we prepare an instance of Update that only needs host and port + // of the proxy when Jubatus is ready + val almostAnUpdater: Try[(String, Int) => JubatusClient] = Try({ + // set up a (host, port) => Updater function or throw an exception + jubaType match { + case lmt@LearningMachineType.Anomaly + if validCombination(lmt, rpcName) => + (jubaHost, jubaPort) => + new Anomaly(jubaHost, jubaPort, cm, featureFunctions) + + case lmt@LearningMachineType.Classifier + if validCombination(lmt, rpcName) => + val label = cm.labelOrId match { + case Some(("label", value)) => + value + case _ => + val msg = "no label for datum specified" + throw new IllegalArgumentException(msg) + } + (jubaHost, jubaPort) => + new Classifier(jubaHost, jubaPort, cm, featureFunctions, label) + + case lmt@LearningMachineType.Recommender + if validCombination(lmt, rpcName) => + val id = cm.labelOrId match { + case Some(("id", value)) => + value + case _ => + val msg = "no id for datum specified" + throw new IllegalArgumentException(msg) + } + (jubaHost, jubaPort) => + new Recommender(jubaHost, jubaPort, cm, featureFunctions, id) + + case otherAlgorithm => + val msg = "'%s' is not a valid method for %s".format( + rpcName, otherAlgorithm + ) + logger.warn(msg) + throw new IllegalArgumentException(msg) + } + }) + // if that was successful, schedule Updater creation when + // Jubatus is ready + almostAnUpdater match { + case Success(jubaCreator) => + val updaterFut: ScFuture[JubatusClient] = 
jubaFut.map(model => { + val jubaHost = model.jubatusProxy.hostAddress + val jubaPort = model.jubatusProxy.port + jubaCreator(jubaHost, jubaPort) + }) + // return the futures of Jubatus and Updater + Right((jubaFut, updaterFut)) + case Failure(t) => + t match { + case _: IllegalArgumentException => + logger.warn(t.getMessage) + Left((400, t.getMessage)) + case _ => + val msg = "unable to create Updater: " + t.getMessage + logger.warn(msg) + Left((500, msg)) + } + + } + } else { + val msg = "source '%s' not found".format(sourceName) + logger.error(msg) + Left((400, msg)) + } + } + } + } + + protected def acceptsMoreStatements(dataSourceName: String): Boolean = { + sources.get(dataSourceName).map(_._1.state == Initialized).getOrElse(false) + } + + /** + * Run a function after ensuring the referenced stream exists and comes from a + * valid data source. + */ + protected def withStream(inputStreamName: String)(handler: String => + Either[(Int, String), JubaQLResponse]): Either[(Int, String), JubaQLResponse] = { + knownStreamNames.get(inputStreamName) match { + case Some(inputDataSourceName) => + sources.get(inputDataSourceName) match { + case Some((inputDataSource, _)) if inputDataSource.state == Initialized => + handler(inputDataSourceName) + case Some(_) => + val msg = s"data source '$inputDataSourceName' cannot accept further statements" + logger.warn(msg) + Left((400, msg)) + case None => + val msg = "data source with name '%s' does not exist".format(inputDataSourceName) + logger.error(msg) + Left((500, msg)) + } + case None => + val msg = "source '%s' not found".format(inputStreamName) + logger.error(msg) + Left((400, msg)) } } - protected def stopStreamProcessing(stopFun: () => (ProcessingInformation, ProcessingInformation)): - (ProcessingInformation, ProcessingInformation) = { + /** + * Run a function after ensuring all referenced streams exist and come from the + * same valid data source. + */ + protected def withStreams(inputStreamNames: Seq[String])(handler: String => + Either[(Int, String), JubaQLResponse]): Either[(Int, String), JubaQLResponse] = { + // look up which data source each stream comes from + val refDataSources = inputStreamNames.flatMap(knownStreamNames.get(_)).toSet + + // check if there are referenced streams that we don't know + (inputStreamNames.filter(!knownStreamNames.contains(_)), refDataSources.toList) match { + // all referenced streams are known and they come from just one data source + case (Nil, mainDataSource :: Nil) if acceptsMoreStatements(mainDataSource) => + handler(mainDataSource) + // data source is not in the correct state + case (Nil, mainDataSource :: Nil) => + val msg = s"data source '$mainDataSource' cannot accept further statements" + logger.warn(msg) + Left((400, msg)) + // all referenced streams are known, but they reference multiple data sources + case (Nil, other) => + val msg = "you cannot use streams from multiple different data sources in one statement" + logger.warn(msg) + Left((400, msg)) + // some referenced streams have not been seen before + case (unknownStreams, _) => + val msg = "unknown streams: %s".format(unknownStreams.mkString(", ")) + logger.warn(msg) + Left((400, msg)) + } + } + + protected def stopStreamProcessing(stopFun: () => (ProcessingInformation, ProcessingInformation), + forShutdown: Boolean): + (ProcessingInformation, ProcessingInformation) = { logger.info("stopping stream processing") // tell executors they should stop their processing - executorsShouldFinishProcessing.set(true) // NB. 
put() has different semantics + if (forShutdown) { + driverStatusMessage.set("shutdown") // NB. put() has different semantics + } else { + driverStatusMessage.set("stop-and-poll") // NB. put() has different semantics + } // the following call will block until processing is done completely val (staticInfo, streamInfo) = stopFun() logger.info("shut down successfully; processed %s/%s items".format( staticInfo.itemCount, streamInfo.itemCount )) + // if we are not executing a SHUTDOWN command, but a STOP PROCESSING + // command, we must reset state so that we can continue processing later + if (!forShutdown) { + driverStatusMessage.set("running") // NB. put() has different semantics + } (staticInfo, streamInfo) } protected def shutdownJubatus(modelName: String, app: JubatusYarnApplication) = { logger.info(s"shutting down model: $modelName") try { - app.stop() + // We have to wait here for the stop() call to complete. If we don't block + // until it is done, the main application may exit and kill this thread + // (this function is actually called from a future.map()) before Jubatus + // is stopped completely. + ScAwait.ready(app.stop(), 1 minute) logger.info(s"model $modelName shut down successfully") } catch { case e: Throwable => @@ -390,37 +1382,8 @@ class JubaQLService(sc: SparkContext, runMode: RunMode) } } - protected def extractDatum(keys: List[String], data: String): Datum = { - extractDatum(keys, JsonMethods.parse(data)) - } - - protected def extractDatum(keys: List[String], jvalue: JValue): Datum = { - // filter unused filed - val filtered = jvalue.filterField { - case JField(key, _) => keys.indexOf(key) >= 0 - case _ => false - } - - var datum = new Datum - filtered.foreach({ - j => - val key = j._1 - j._2 match { - case JInt(v) => - datum.addNumber(key, v.toDouble) - case JDouble(v) => - datum.addNumber(key, v) - case JString(v) => - datum.addString(key, v) - case _ => - } - j - }) - return datum - } - - - protected def queryAnalyze(ana: Analyze): Option[String] = { + protected def queryAnalyze(ana: Analyze): Either[(Int, String), AnalyzeResult] = { + // TODO remove duplicated functionality with JubatusClient def datumToJson(datum: Datum): DatumResult = { DatumResult( datum.getStringValues().asScala.map(v => (v.key, v.value)).toMap, @@ -428,96 +1391,83 @@ class JubaQLService(sc: SparkContext, runMode: RunMode) ) } models.get(ana.modelName) match { - case Some((s, cm, LearningMachineType.Anomaly)) if ana.rpcName == "calc_score" => - val host = s.jubatusProxy.hostAddress - val port = s.jubatusProxy.port - val keys = cm.specifier.toMap.get("datum") match { - case Some(list) if list.nonEmpty => list - case _ => ??? // TODO: throw exception. datum not specified - } - var datum = extractDatum(keys, ana.data) - val anomaly = new AnomalyClient(host, port, ana.modelName, 5) - try { - val score = AnomalyScore(anomaly.calcScore(datum)) - implicit val formats = DefaultFormats - return Some(Serialization.write(score)) - } finally { - anomaly.getClient.close() - } + case Some((jubaApp, createModelStmt, machineType)) => + val host = jubaApp.jubatusProxy.hostAddress + val port = jubaApp.jubatusProxy.port - case Some((s, cm, LearningMachineType.Classifier)) if ana.rpcName == "classify" => - val host = s.jubatusProxy.hostAddress - val port = s.jubatusProxy.port - val keys = cm.specifier.toMap.get("datum") match { - case Some(list) if list.nonEmpty => list - case _ => ??? // TODO: throw exception. 
datum not specified - } - var datum = extractDatum(keys, ana.data) - val data = new java.util.LinkedList[Datum]() - data.add(datum) - val classifier = new ClassifierClient(host, port, ana.modelName, 5) - try { - val res = classifier.classify(data) - if (res.size() >= 1) { - // return in json format - val retValue = ClassifierResult(res.get(0).asScala.map({ - f => ClassifierPrediction(f.label, f.score) - }).toList) - implicit val formats = DefaultFormats - return Some(Serialization.write(retValue)) - } else { - // TODO: return error in json - } - } finally { - classifier.getClient().close() - } - case Some((s, cm, LearningMachineType.Recommender)) if (ana.rpcName == "complete_row_from_id" || - ana.rpcName == "complete_row_from_datum") => - val host = s.jubatusProxy.hostAddress - val port = s.jubatusProxy.port - ana.rpcName match { - case "complete_row_from_id" => + machineType match { + case LearningMachineType.Anomaly if ana.rpcName == "calc_score" => + val datum = DatumExtractor.extract(createModelStmt, ana.data, featureFunctions, logger) + val anomaly = new AnomalyClient(host, port, ana.modelName, 5) + try { + Right(AnomalyScore(anomaly.calcScore(datum))) + } finally { + anomaly.getClient.close() + } + + case LearningMachineType.Classifier if ana.rpcName == "classify" => + val datum = DatumExtractor.extract(createModelStmt, ana.data, featureFunctions, logger) + val data = new java.util.LinkedList[Datum]() + data.add(datum) + val classifier = new ClassifierClient(host, port, ana.modelName, 5) + try { + val res = classifier.classify(data) + if (res.size() >= 1) { + // return in json format + val retValue = ClassifierResult(res.get(0).asScala.map({ + f => ClassifierPrediction(f.label, f.score) + }).toList) + Right(retValue) + } else { + val msg = "got an empty result from classifier" + logger.error(msg) + Left((500, msg)) + } + } finally { + classifier.getClient.close() + } + + case LearningMachineType.Recommender if ana.rpcName == "complete_row_from_id" => val recommender = new RecommenderClient(host, port, ana.modelName, 5) try { val retDatum = datumToJson(recommender.completeRowFromId(ana.data)) - - implicit val formats = DefaultFormats - return Some(Serialization.write(retDatum)) + Right(retDatum) } finally { recommender.getClient().close() } - case "complete_row_from_datum" => - val keys = cm.specifier.toMap.get("datum") match { - case Some(list) if list.nonEmpty => list - case _ => ??? // TODO: throw exception. 
datum not specified - } - var datum = extractDatum(keys, ana.data) + case LearningMachineType.Recommender if ana.rpcName == "complete_row_from_datum" => + val datum = DatumExtractor.extract(createModelStmt, ana.data, featureFunctions, logger) val recommender = new RecommenderClient(host, port, ana.modelName, 5) - try { val retDatum = datumToJson(recommender.completeRowFromDatum(datum)) - - implicit val formats = DefaultFormats - return Some(Serialization.write(retDatum)) + Right(retDatum) } finally { - recommender.getClient().close() + recommender.getClient.close() } + case _ => + val msg = "cannot use model '%s' with method '%s'".format(ana.modelName, ana.rpcName) + logger.warn(msg) + Left((400, msg)) } - case _ => - // error - None + + case None => + val msg = "model '%s' does not exist".format(ana.modelName) + logger.warn(msg) + Left((400, msg)) } - None } } sealed trait RunMode object RunMode { + case class Production(zookeeper: List[(String, Int)]) extends RunMode + case object Development extends RunMode + } object LocalJubatusApplication extends LazyLogging { @@ -550,7 +1500,8 @@ object LocalJubatusApplication extends LazyLogging { val namedPipe = new java.io.File(namedPipePath) try { - val jubatusProcess = runtime.exec(s"$jubaCmdName -f $namedPipePath") + val rpcPort = findAvailablePort() + val jubatusProcess = runtime.exec(s"$jubaCmdName -p $rpcPort -f $namedPipePath") handleSubProcessOutput(jubatusProcess.getInputStream, System.out, jubaCmdName) handleSubProcessOutput(jubatusProcess.getErrorStream, System.err, jubaCmdName) val namedPipeWriter = new java.io.PrintWriter(namedPipe) @@ -560,7 +1511,8 @@ object LocalJubatusApplication extends LazyLogging { namedPipeWriter.close() } - new LocalJubatusApplication(jubatusProcess, aLearningMachineName, jubaCmdName) + new LocalJubatusApplication(jubatusProcess, aLearningMachineName, jubaCmdName, + rpcPort) } finally { namedPipe.delete() } @@ -595,11 +1547,27 @@ object LocalJubatusApplication extends LazyLogging { thread.setDaemon(true) thread.start() } + + protected def findAvailablePort(): Int = { + // connect to ports until we fail to connect to one + Stream.from(9199).filter(port => { + try { + val socket = new java.net.Socket("127.0.0.1", port) + socket.close() + false + } catch { + case e: java.net.ConnectException => + true + case e: Throwable => + false + } + }).head + } } // LocalJubatusApplication is not a JubatusYarnApplication, but extends JubatusYarnApplication for implementation. -class LocalJubatusApplication(jubatus: Process, name: String, jubaCmdName: String) - extends JubatusYarnApplication(Location(InetAddress.getLocalHost, 9199), List(), null) { +class LocalJubatusApplication(jubatus: Process, name: String, jubaCmdName: String, port: Int = 9199) + extends JubatusYarnApplication(Location(InetAddress.getLocalHost, port), List(), null) { override def status: JubatusYarnApplicationStatus = { throw new NotImplementedError("status is not implemented") diff --git a/processor/src/main/scala/us/jubat/jubaql_server/processor/PreparedJubaQLStatement.scala b/processor/src/main/scala/us/jubat/jubaql_server/processor/PreparedJubaQLStatement.scala new file mode 100644 index 0000000..2685735 --- /dev/null +++ b/processor/src/main/scala/us/jubat/jubaql_server/processor/PreparedJubaQLStatement.scala @@ -0,0 +1,60 @@ +// Jubatus: Online machine learning framework for distributed environment +// Copyright (C) 2014-2015 Preferred Networks and Nippon Telegraph and Telephone Corporation. 
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License version 2.1 as published by the Free Software Foundation.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+//
+// You should have received a copy of the GNU Lesser General Public
+// License along with this library; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+package us.jubat.jubaql_server.processor
+
+import org.apache.spark.sql.DataType
+import org.apache.spark.sql.catalyst.expressions.Expression
+import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
+import org.apache.spark.sql.catalyst.types.StructType
+import scala.concurrent.Future
+import us.jubat.yarn.client.JubatusYarnApplication
+import us.jubat.jubaql_server.processor.updater.JubatusClient
+
+sealed trait PreparedJubaQLStatement
+
+case class PreparedUpdate(modelName: String,
+                          modelFut: Future[JubatusYarnApplication],
+                          dataSourceName: String,
+                          updaterFut: Future[JubatusClient]) extends PreparedJubaQLStatement
+
+case class PreparedCreateStreamFromSelect(streamName: String,
+                                          selectPlan: LogicalPlan,
+                                          usedTables: List[String]) extends PreparedJubaQLStatement {
+  override def toString: String = {
+    "PreparedCreateStreamFromSelect(%s,