[SPARK-50933][ML][PYTHON][CONNECT] Support Feature Selectors on Connect
### What changes were proposed in this pull request?
Support Feature Selectors on Connect:

- ChiSqSelector
- UnivariateFeatureSelector
- VarianceThresholdSelector

### Why are the changes needed?
Feature parity between Spark Connect and classic Spark ML: these selectors become usable from Connect clients.

### Does this PR introduce _any_ user-facing change?
Yes, these new selector algorithms are supported on Connect.
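
For illustration, here is a minimal sketch of the newly supported workflow from a Spark Connect client. The `sc://localhost` URL is an assumption for this sketch; the toy DataFrame mirrors the one used in the added tests.

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import ChiSqSelector
from pyspark.ml.linalg import Vectors

# Connect to a Spark Connect server; the URL is an assumption for this sketch.
spark = SparkSession.builder.remote("sc://localhost").getOrCreate()

# Toy data mirroring the added tests.
df = spark.createDataFrame(
    [
        (Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
        (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
        (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0),
    ],
    ["features", "label"],
)

# Same API as classic PySpark ML; under Connect the fit runs on the server.
selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")
model = selector.fit(df)

print(model.selectedFeatures)   # [2] on this toy data, per the new test
model.transform(df).show()
```

Under Connect, the Python object is a thin client-side handle; the fitted model itself lives on the server.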

### How was this patch tested?
Added unit tests in `python/pyspark/ml/tests/test_feature.py`.

### Was this patch authored or co-authored using generative AI tooling?
No.

Closes #49641 from zhengruifeng/ml_connect_selector.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
zhengruifeng committed Jan 24, 2025
1 parent 4e3b831 commit e887f05
Showing 7 changed files with 120 additions and 0 deletions.
@@ -52,6 +52,9 @@ org.apache.spark.ml.feature.StandardScaler
org.apache.spark.ml.feature.MaxAbsScaler
org.apache.spark.ml.feature.MinMaxScaler
org.apache.spark.ml.feature.RobustScaler
org.apache.spark.ml.feature.ChiSqSelector
org.apache.spark.ml.feature.UnivariateFeatureSelector
org.apache.spark.ml.feature.VarianceThresholdSelector
org.apache.spark.ml.feature.StringIndexer
org.apache.spark.ml.feature.PCA
org.apache.spark.ml.feature.Word2Vec
@@ -56,6 +56,9 @@ org.apache.spark.ml.feature.StandardScalerModel
org.apache.spark.ml.feature.MaxAbsScalerModel
org.apache.spark.ml.feature.MinMaxScalerModel
org.apache.spark.ml.feature.RobustScalerModel
org.apache.spark.ml.feature.ChiSqSelectorModel
org.apache.spark.ml.feature.UnivariateFeatureSelectorModel
org.apache.spark.ml.feature.VarianceThresholdSelectorModel
org.apache.spark.ml.feature.StringIndexerModel
org.apache.spark.ml.feature.PCAModel
org.apache.spark.ml.feature.Word2VecModel
@@ -137,6 +137,9 @@ final class ChiSqSelectorModel private[ml] (

import ChiSqSelectorModel._

private[ml] def this() = this(
Identifiable.randomUID("chiSqSelector"), Array.emptyIntArray)

override protected def isNumericAttribute = false

/** @group setParam */
@@ -289,6 +289,9 @@ class UnivariateFeatureSelectorModel private[ml](
extends Model[UnivariateFeatureSelectorModel] with UnivariateFeatureSelectorParams
with MLWritable {

private[ml] def this() = this(
Identifiable.randomUID("UnivariateFeatureSelector"), Array.emptyIntArray)

/** @group setParam */
@Since("3.1.1")
def setFeaturesCol(value: String): this.type = set(featuresCol, value)
@@ -126,6 +126,9 @@ class VarianceThresholdSelectorModel private[ml](
extends Model[VarianceThresholdSelectorModel] with VarianceThresholdSelectorParams
with MLWritable {

private[ml] def this() = this(
Identifiable.randomUID("VarianceThresholdSelector"), Array.emptyIntArray)

if (selectedFeatures.length >= 2) {
require(selectedFeatures.sliding(2).forall(l => l(0) < l(1)),
"Index should be strictly increasing.")
102 changes: 102 additions & 0 deletions python/pyspark/ml/tests/test_feature.py
@@ -42,6 +42,12 @@
MinMaxScalerModel,
RobustScaler,
RobustScalerModel,
ChiSqSelector,
ChiSqSelectorModel,
UnivariateFeatureSelector,
UnivariateFeatureSelectorModel,
VarianceThresholdSelector,
VarianceThresholdSelectorModel,
StopWordsRemover,
StringIndexer,
StringIndexerModel,
@@ -391,6 +397,102 @@ def test_robust_scaler(self):
self.assertEqual(str(model), str(model2))
self.assertEqual(model2.getOutputCol(), "scaled")

def test_chi_sq_selector(self):
df = self.spark.createDataFrame(
[
(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
(Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
(Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0),
],
["features", "label"],
)

selector = ChiSqSelector(numTopFeatures=1, outputCol="selectedFeatures")
self.assertEqual(selector.getNumTopFeatures(), 1)
self.assertEqual(selector.getOutputCol(), "selectedFeatures")

model = selector.fit(df)
self.assertEqual(model.selectedFeatures, [2])

output = model.transform(df)
self.assertEqual(output.columns, ["features", "label", "selectedFeatures"])
self.assertEqual(output.count(), 3)

# save & load
with tempfile.TemporaryDirectory(prefix="chi_sq_selector") as d:
selector.write().overwrite().save(d)
selector2 = ChiSqSelector.load(d)
self.assertEqual(str(selector), str(selector2))

model.write().overwrite().save(d)
model2 = ChiSqSelectorModel.load(d)
self.assertEqual(str(model), str(model2))

def test_univariate_selector(self):
df = self.spark.createDataFrame(
[
(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
(Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
(Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0),
],
["features", "label"],
)

selector = UnivariateFeatureSelector(outputCol="selectedFeatures")
selector.setFeatureType("continuous").setLabelType("categorical").setSelectionThreshold(1)
self.assertEqual(selector.getFeatureType(), "continuous")
self.assertEqual(selector.getLabelType(), "categorical")
self.assertEqual(selector.getOutputCol(), "selectedFeatures")
self.assertEqual(selector.getSelectionThreshold(), 1)

model = selector.fit(df)
self.assertEqual(model.selectedFeatures, [3])

output = model.transform(df)
self.assertEqual(output.columns, ["features", "label", "selectedFeatures"])
self.assertEqual(output.count(), 3)

# save & load
with tempfile.TemporaryDirectory(prefix="univariate_selector") as d:
selector.write().overwrite().save(d)
selector2 = UnivariateFeatureSelector.load(d)
self.assertEqual(str(selector), str(selector2))

model.write().overwrite().save(d)
model2 = UnivariateFeatureSelectorModel.load(d)
self.assertEqual(str(model), str(model2))

def test_variance_threshold_selector(self):
df = self.spark.createDataFrame(
[
(Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
(Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
(Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0),
],
["features", "label"],
)

selector = VarianceThresholdSelector(varianceThreshold=2, outputCol="selectedFeatures")
self.assertEqual(selector.getVarianceThreshold(), 2)
self.assertEqual(selector.getOutputCol(), "selectedFeatures")

model = selector.fit(df)
self.assertEqual(model.selectedFeatures, [2])

output = model.transform(df)
self.assertEqual(output.columns, ["features", "label", "selectedFeatures"])
self.assertEqual(output.count(), 3)

# save & load
with tempfile.TemporaryDirectory(prefix="variance_threshold_selector") as d:
selector.write().overwrite().save(d)
selector2 = VarianceThresholdSelector.load(d)
self.assertEqual(str(selector), str(selector2))

model.write().overwrite().save(d)
model2 = VarianceThresholdSelectorModel.load(d)
self.assertEqual(str(model), str(model2))

def test_word2vec(self):
sent = ("a b " * 100 + "a c " * 10).split(" ")
df = self.spark.createDataFrame([(sent,), (sent,)], ["sentence"]).coalesce(1)
@@ -589,6 +589,9 @@ private[ml] object MLUtils {
(classOf[MaxAbsScalerModel], Set("maxAbs")),
(classOf[MinMaxScalerModel], Set("originalMax", "originalMin")),
(classOf[RobustScalerModel], Set("range", "median")),
(classOf[ChiSqSelectorModel], Set("selectedFeatures")),
(classOf[UnivariateFeatureSelectorModel], Set("selectedFeatures")),
(classOf[VarianceThresholdSelectorModel], Set("selectedFeatures")),
(classOf[PCAModel], Set("pc", "explainedVariance")),
(classOf[Word2VecModel], Set("getVectors", "findSynonyms", "findSynonymsArray")))

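For context, the allow-list above is what lets a Connect client read `selectedFeatures` from a server-side selector model. A hedged, self-contained sketch of the client-side access this enables (the `sc://localhost` URL is again an assumption; the data and parameters come from the new tests):

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import UnivariateFeatureSelector
from pyspark.ml.linalg import Vectors

# The Connect URL is an assumption for this sketch.
spark = SparkSession.builder.remote("sc://localhost").getOrCreate()

df = spark.createDataFrame(
    [
        (Vectors.dense([0.0, 0.0, 18.0, 1.0]), 1.0),
        (Vectors.dense([0.0, 1.0, 12.0, 0.0]), 0.0),
        (Vectors.dense([1.0, 0.0, 15.0, 0.1]), 0.0),
    ],
    ["features", "label"],
)

selector = (
    UnivariateFeatureSelector(outputCol="selectedFeatures")
    .setFeatureType("continuous")
    .setLabelType("categorical")
    .setSelectionThreshold(1)
)
model = selector.fit(df)

# `selectedFeatures` is whitelisted for these model classes in MLUtils above,
# so the Connect client can fetch it from the remote model.
print(model.selectedFeatures)   # [3] on this toy data, per the new test
model.transform(df).select("selectedFeatures").show()
```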
