Merge remote-tracking branch 'upstream/master' into isLargerBetter
wbo4958 committed Jan 24, 2025
2 parents a948963 + 7c316f7 commit a28d7f1
Showing 28 changed files with 666 additions and 129 deletions.
6 changes: 6 additions & 0 deletions common/utils/src/main/resources/error/error-conditions.json
@@ -82,6 +82,12 @@
     ],
     "sqlState" : "22003"
   },
+  "ARTIFACT_ALREADY_EXISTS" : {
+    "message" : [
+      "The artifact <normalizedRemoteRelativePath> already exists. Please choose a different name for the new artifact because it cannot be overwritten."
+    ],
+    "sqlState" : "42713"
+  },
   "ASSIGNMENT_ARITY_MISMATCH" : {
     "message" : [
       "The number of columns or variables assigned or aliased: <numTarget> does not match the number of source expressions: <numExpr>."
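The new condition is raised through Spark's error-class framework. A minimal sketch of how it might surface, assuming the standard SparkRuntimeException constructor (the actual raise site is not among the hunks captured here, and "jars/my-udf.jar" is a made-up artifact path):

    import org.apache.spark.SparkRuntimeException

    // Illustrative only: raise the new condition with its message parameter.
    throw new SparkRuntimeException(
      errorClass = "ARTIFACT_ALREADY_EXISTS",
      messageParameters = Map("normalizedRemoteRelativePath" -> "jars/my-udf.jar"))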
SchemaConverters.scala (file path not captured)
@@ -22,12 +22,10 @@
 import com.google.protobuf.Descriptors.{Descriptor, FieldDescriptor}
 import com.google.protobuf.WireFormat
 
-import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.errors.QueryCompilationErrors
 import org.apache.spark.sql.types._
 
-@DeveloperApi
 object SchemaConverters extends Logging {
 
   /**
@@ -42,13 +40,13 @@ object SchemaConverters extends Logging {
    *
    * @since 3.4.0
    */
-  def toSqlType(
+  private[protobuf] def toSqlType(
       descriptor: Descriptor,
       protobufOptions: ProtobufOptions = ProtobufOptions(Map.empty)): SchemaType = {
     toSqlTypeHelper(descriptor, protobufOptions)
   }
 
-  def toSqlTypeHelper(
+  private[protobuf] def toSqlTypeHelper(
       descriptor: Descriptor,
       protobufOptions: ProtobufOptions): SchemaType = {
     val fields = descriptor.getFields.asScala.flatMap(
@@ -65,7 +63,7 @@ object SchemaConverters extends Logging {
   // exceed the maximum recursive depth specified by the recursiveFieldMaxDepth option.
   // A return of None implies the field has reached the maximum allowed recursive depth and
   // should be dropped.
-  def structFieldFor(
+  private def structFieldFor(
       fd: FieldDescriptor,
       existingRecordNames: Map[String, Int],
       protobufOptions: ProtobufOptions): Option[StructField] = {
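With toSqlType and toSqlTypeHelper now private[protobuf], schema derivation is reachable only through the connector's public functions. A rough sketch of the public entry point, assuming an existing DataFrame df with a binary payload column ("/tmp/events.desc" is a hypothetical descriptor-set file):

    import org.apache.spark.sql.protobuf.functions.from_protobuf

    // Derive the Catalyst schema internally and parse the binary column.
    val parsed = df.select(
      from_protobuf(df("payload"), "MyMessage", "/tmp/events.desc").alias("event"))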
1 change: 1 addition & 0 deletions dev/sparktestsupport/modules.py
@@ -1126,6 +1126,7 @@ def __hash__(self):
         "pyspark.ml.tests.connect.test_parity_clustering",
         "pyspark.ml.tests.connect.test_parity_evaluation",
         "pyspark.ml.tests.connect.test_parity_feature",
+        "pyspark.ml.tests.connect.test_parity_pipeline",
     ],
     excluded_python_implementations=[
         "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
Vectors.scala (file path not captured)
@@ -240,7 +240,7 @@ sealed trait Vector extends Serializable {
 @Since("2.0.0")
 object Vectors {
 
-  private[ml] val empty: Vector = zeros(0)
+  private[ml] val empty: DenseVector = new DenseVector(Array.emptyDoubleArray)
 
   /**
    * Creates a dense vector from its values.
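Typing the val as DenseVector removes casts at call sites (see the PCAModel hunk below). A small sketch of the equivalent expression, since Vectors.empty itself is private[ml]:

    import org.apache.spark.ml.linalg.DenseVector

    // Same value as the new definition: a zero-length dense vector,
    // statically typed as DenseVector rather than the Vector supertype.
    val empty: DenseVector = new DenseVector(Array.emptyDoubleArray)
    assert(empty.size == 0)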
Spark Connect ML estimator registration file (file path not captured)
@@ -19,6 +19,7 @@
 # So register the supported estimator here if you're trying to add a new one.
 
 # classification
+org.apache.spark.ml.classification.LinearSVC
 org.apache.spark.ml.classification.LogisticRegression
 org.apache.spark.ml.classification.DecisionTreeClassifier
 org.apache.spark.ml.classification.RandomForestClassifier
@@ -52,3 +53,4 @@ org.apache.spark.ml.feature.MinMaxScaler
 org.apache.spark.ml.feature.RobustScaler
 org.apache.spark.ml.feature.StringIndexer
 org.apache.spark.ml.feature.PCA
+org.apache.spark.ml.feature.Word2Vec
Spark Connect ML transformer/model registration file (file path not captured)
@@ -18,10 +18,16 @@
 # Spark Connect ML uses ServiceLoader to find out the supported Spark Ml non-model transformer.
 # So register the supported transformer here if you're trying to add a new one.
 ########### Transformers
+org.apache.spark.ml.feature.DCT
 org.apache.spark.ml.feature.VectorAssembler
+org.apache.spark.ml.feature.Tokenizer
+org.apache.spark.ml.feature.RegexTokenizer
+org.apache.spark.ml.feature.SQLTransformer
+org.apache.spark.ml.feature.StopWordsRemover
 
 ########### Model for loading
 # classification
+org.apache.spark.ml.classification.LinearSVCModel
 org.apache.spark.ml.classification.LogisticRegressionModel
 org.apache.spark.ml.classification.DecisionTreeClassificationModel
 org.apache.spark.ml.classification.RandomForestClassificationModel
@@ -50,3 +56,4 @@ org.apache.spark.ml.feature.MinMaxScalerModel
 org.apache.spark.ml.feature.RobustScalerModel
 org.apache.spark.ml.feature.StringIndexerModel
 org.apache.spark.ml.feature.PCAModel
+org.apache.spark.ml.feature.Word2VecModel
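As the header comment notes, Spark Connect ML discovers these operators through ServiceLoader. A minimal sketch of the underlying JDK mechanism, assuming the registration files follow the standard META-INF/services/<interface-name> layout (Spark's own loading helper may differ in detail):

    import java.util.ServiceLoader
    import scala.jdk.CollectionConverters._

    // List provider class names registered for an interface without
    // instantiating them (ServiceLoader.stream() is JDK 9+).
    def registeredProviders[T](iface: Class[T]): Seq[String] =
      ServiceLoader.load(iface).stream().iterator().asScala
        .map(_.`type`().getName).toSeq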
LinearSVC.scala (file path not captured)
@@ -365,6 +365,8 @@ class LinearSVCModel private[classification] (
     extends ClassificationModel[Vector, LinearSVCModel]
     with LinearSVCParams with MLWritable with HasTrainingSummary[LinearSVCTrainingSummary] {
 
+  private[ml] def this() = this(Identifiable.randomUID("linearsvc"), Vectors.empty, 0.0)
+
   @Since("2.2.0")
   override val numClasses: Int = 2
2 changes: 1 addition & 1 deletion mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
@@ -130,7 +130,7 @@ class PCAModel private[ml] (
   // For ml connect only
   @Since("4.0.0")
   private[ml] def this() = this(Identifiable.randomUID("pca"),
-    DenseMatrix.zeros(1, 1), Vectors.empty.asInstanceOf[DenseVector])
+    DenseMatrix.zeros(1, 1), Vectors.empty)
 
   /** @group setParam */
   @Since("1.5.0")
Word2Vec.scala (file path not captured)
@@ -211,6 +211,8 @@ class Word2VecModel private[ml] (
 
   import Word2VecModel._
 
+  private[ml] def this() = this(Identifiable.randomUID("w2v"), null)
+
   /**
    * Returns a dataframe with two fields, "word" and "vector", with "word" being a String and
    * and the vector the DenseVector that it is mapped to.
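This gives LinearSVCModel and Word2VecModel the same "for ml connect only" pattern PCAModel already uses: a package-private zero-argument constructor with placeholder state. A hypothetical sketch of why a loader wants such a constructor (the actual Spark Connect load path is not shown in this commit):

    import org.apache.spark.ml.classification.LinearSVCModel

    // Reflectively build an empty placeholder model; the loading machinery
    // would fill in real coefficients afterwards.
    val ctor = classOf[LinearSVCModel].getDeclaredConstructor()
    ctor.setAccessible(true) // the constructor is private[ml], not public
    val placeholder: LinearSVCModel = ctor.newInstance()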
2 changes: 1 addition & 1 deletion python/pyspark/ml/classification.py
@@ -62,7 +62,6 @@
     HasSolver,
     HasParallelism,
 )
-from pyspark.ml.util import try_remote_attribute_relation
 from pyspark.ml.tree import (
     _DecisionTreeModel,
     _DecisionTreeParams,
@@ -86,6 +85,7 @@
     MLWriter,
     MLWritable,
     HasTrainingSummary,
+    try_remote_attribute_relation,
 )
 from pyspark.ml.wrapper import JavaParams, JavaPredictor, JavaPredictionModel, JavaWrapper
 from pyspark.ml.common import inherit_doc
(Diff not loaded for the remaining changed files.)
