From edf1f3d767353ecaaa49b934a4599dcbaab99750 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 23 Jan 2025 18:38:04 +0800 Subject: [PATCH 1/2] init --- .../org/apache/spark/ml/linalg/Vectors.scala | 2 +- .../services/org.apache.spark.ml.Estimator | 1 + .../services/org.apache.spark.ml.Transformer | 1 + .../spark/ml/classification/LinearSVC.scala | 2 + .../org/apache/spark/ml/feature/PCA.scala | 2 +- python/pyspark/ml/classification.py | 2 +- .../pyspark/ml/tests/test_classification.py | 76 +++++++++++++++++++ .../apache/spark/sql/connect/ml/MLUtils.scala | 1 + 8 files changed, 84 insertions(+), 3 deletions(-) diff --git a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala index 94548afb6c292..a86cd2c994498 100644 --- a/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala +++ b/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vectors.scala @@ -240,7 +240,7 @@ sealed trait Vector extends Serializable { @Since("2.0.0") object Vectors { - private[ml] val empty: Vector = zeros(0) + private[ml] val empty: DenseVector = new DenseVector(Array.emptyDoubleArray) /** * Creates a dense vector from its values. diff --git a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator index 6c5bbd858d9cc..d4fceadeea2f0 100644 --- a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator +++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Estimator @@ -19,6 +19,7 @@ # So register the supported estimator here if you're trying to add a new one. # classification +org.apache.spark.ml.classification.LinearSVC org.apache.spark.ml.classification.LogisticRegression org.apache.spark.ml.classification.DecisionTreeClassifier org.apache.spark.ml.classification.RandomForestClassifier diff --git a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer index 0448117468198..456bbc0e14299 100644 --- a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer +++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer @@ -22,6 +22,7 @@ org.apache.spark.ml.feature.VectorAssembler ########### Model for loading # classification +org.apache.spark.ml.classification.LinearSVCModel org.apache.spark.ml.classification.LogisticRegressionModel org.apache.spark.ml.classification.DecisionTreeClassificationModel org.apache.spark.ml.classification.RandomForestClassificationModel diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala index 161e8f4cbd2c5..6fa7f4d5d493c 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala @@ -365,6 +365,8 @@ class LinearSVCModel private[classification] ( extends ClassificationModel[Vector, LinearSVCModel] with LinearSVCParams with MLWritable with HasTrainingSummary[LinearSVCTrainingSummary] { + private[ml] def this() = this(Identifiable.randomUID("linearsvc"), Vectors.empty, 0.0) + @Since("2.2.0") override val numClasses: Int = 2 diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala index 7d630233eb0a4..67c8fcf15eec2 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala @@ -130,7 +130,7 @@ class PCAModel private[ml] ( // For ml connect only @Since("4.0.0") private[ml] def this() = this(Identifiable.randomUID("pca"), - DenseMatrix.zeros(1, 1), Vectors.empty.asInstanceOf[DenseVector]) + DenseMatrix.zeros(1, 1), Vectors.empty) /** @group setParam */ @Since("1.5.0") diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index fa51d88283403..d8ed51a82abe9 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -62,7 +62,6 @@ HasSolver, HasParallelism, ) -from pyspark.ml.util import try_remote_attribute_relation from pyspark.ml.tree import ( _DecisionTreeModel, _DecisionTreeParams, @@ -86,6 +85,7 @@ MLWriter, MLWritable, HasTrainingSummary, + try_remote_attribute_relation, ) from pyspark.ml.wrapper import JavaParams, JavaPredictor, JavaPredictionModel, JavaWrapper from pyspark.ml.common import inherit_doc diff --git a/python/pyspark/ml/tests/test_classification.py b/python/pyspark/ml/tests/test_classification.py index 8afa7327fea16..cff8e18057f29 100644 --- a/python/pyspark/ml/tests/test_classification.py +++ b/python/pyspark/ml/tests/test_classification.py @@ -24,6 +24,10 @@ from pyspark.ml.linalg import Vectors, Matrices from pyspark.sql import SparkSession, DataFrame from pyspark.ml.classification import ( + LinearSVC, + LinearSVCModel, + LinearSVCSummary, + LinearSVCTrainingSummary, LogisticRegression, LogisticRegressionModel, LogisticRegressionSummary, @@ -299,6 +303,78 @@ def test_logistic_regression(self): except OSError: pass + def test_linear_svc(self): + df = ( + self.spark.createDataFrame( + [ + (1.0, 1.0, Vectors.dense(0.0, 5.0)), + (0.0, 2.0, Vectors.dense(1.0, 2.0)), + (1.0, 3.0, Vectors.dense(2.0, 1.0)), + (0.0, 4.0, Vectors.dense(3.0, 3.0)), + ], + ["label", "weight", "features"], + ) + .coalesce(1) + .sortWithinPartitions("weight") + ) + + svc = LinearSVC(maxIter=1, regParam=1.0) + self.assertEqual(svc.getMaxIter(), 1) + self.assertEqual(svc.getRegParam(), 1.0) + + model = svc.fit(df) + self.assertEqual(model.numClasses, 2) + self.assertEqual(model.numFeatures, 2) + self.assertTrue(np.allclose(model.intercept, 0.025877458475338313, atol=1e-4)) + self.assertTrue( + np.allclose(model.coefficients.toArray(), [-0.03622844, 0.01035098], atol=1e-4) + ) + + vec = Vectors.dense(0.0, 5.0) + self.assertEqual(model.predict(vec), 1.0) + self.assertTrue( + np.allclose(model.predictRaw(vec).toArray(), [-0.07763238, 0.07763238], atol=1e-4) + ) + + output = model.transform(df) + expected_cols = [ + "label", + "weight", + "features", + "rawPrediction", + "prediction", + ] + self.assertEqual(output.columns, expected_cols) + self.assertEqual(output.count(), 4) + + # model summary + self.assertTrue(model.hasSummary) + summary = model.summary() + self.assertIsInstance(summary, LinearSVCSummary) + self.assertIsInstance(summary, LinearSVCTrainingSummary) + self.assertEqual(summary.labels, [0.0, 1.0]) + self.assertEqual(summary.accuracy, 0.5) + self.assertEqual(summary.areaUnderROC, 0.75) + self.assertEqual(summary.predictions.columns, expected_cols) + + summary2 = model.evaluate(df) + self.assertIsInstance(summary2, LinearSVCSummary) + self.assertFalse(isinstance(summary2, LinearSVCTrainingSummary)) + self.assertEqual(summary2.labels, [0.0, 1.0]) + self.assertEqual(summary2.accuracy, 0.5) + self.assertEqual(summary2.areaUnderROC, 0.75) + self.assertEqual(summary2.predictions.columns, expected_cols) + + # Model save & load + with tempfile.TemporaryDirectory(prefix="binary_random_forest") as d: + svc.write().overwrite().save(d) + svc2 = LinearSVC.load(d) + self.assertEqual(str(svc), str(svc2)) + + model.write().overwrite().save(d) + model2 = LinearSVCModel.load(d) + self.assertEqual(str(model), str(model2)) + def test_decision_tree_classifier(self): df = ( self.spark.createDataFrame( diff --git a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala index dd961a3415cb5..926fc23621634 100644 --- a/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala +++ b/sql/connect/server/src/main/scala/org/apache/spark/sql/connect/ml/MLUtils.scala @@ -526,6 +526,7 @@ private[ml] object MLUtils { (classOf[GBTRegressionModel], Set("featureImportances", "evaluateEachIteration")), // Classification Models + (classOf[LinearSVCModel], Set("intercept", "coefficients", "evaluate")), ( classOf[LogisticRegressionModel], Set("intercept", "coefficients", "interceptVector", "coefficientMatrix", "evaluate")), From a2656886de9c15d18b317af4c8ab37dfb512b084 Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Thu, 23 Jan 2025 18:38:27 +0800 Subject: [PATCH 2/2] init --- python/pyspark/ml/tests/test_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyspark/ml/tests/test_classification.py b/python/pyspark/ml/tests/test_classification.py index cff8e18057f29..bcf376007198a 100644 --- a/python/pyspark/ml/tests/test_classification.py +++ b/python/pyspark/ml/tests/test_classification.py @@ -366,7 +366,7 @@ def test_linear_svc(self): self.assertEqual(summary2.predictions.columns, expected_cols) # Model save & load - with tempfile.TemporaryDirectory(prefix="binary_random_forest") as d: + with tempfile.TemporaryDirectory(prefix="linear_svc") as d: svc.write().overwrite().save(d) svc2 = LinearSVC.load(d) self.assertEqual(str(svc), str(svc2))