diff --git a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer index d2ba928af47ae..84eebb21f9e23 100644 --- a/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer +++ b/mllib/src/main/resources/META-INF/services/org.apache.spark.ml.Transformer @@ -19,6 +19,7 @@ # So register the supported transformer here if you're trying to add a new one. ########### Transformers org.apache.spark.ml.feature.DCT +org.apache.spark.ml.feature.Binarizer org.apache.spark.ml.feature.VectorAssembler org.apache.spark.ml.feature.Tokenizer org.apache.spark.ml.feature.RegexTokenizer diff --git a/python/pyspark/ml/tests/connect/test_parity_feature.py b/python/pyspark/ml/tests/connect/test_parity_feature.py index c49faaf18e735..55d299c063708 100644 --- a/python/pyspark/ml/tests/connect/test_parity_feature.py +++ b/python/pyspark/ml/tests/connect/test_parity_feature.py @@ -22,10 +22,6 @@ class FeatureParityTests(FeatureTestsMixin, ReusedConnectTestCase): - @unittest.skip("Need to support.") - def test_binarizer(self): - super().test_binarizer() - @unittest.skip("Need to support.") def test_idf(self): super().test_idf() diff --git a/python/pyspark/ml/tests/test_feature.py b/python/pyspark/ml/tests/test_feature.py index 16480b515a136..7a727bcebc6bf 100644 --- a/python/pyspark/ml/tests/test_feature.py +++ b/python/pyspark/ml/tests/test_feature.py @@ -574,6 +574,46 @@ def test_binarizer(self): self.assertEqual(b1.getInputCol(), "input") self.assertEqual(b1.getOutputCol(), "output") + df = self.spark.createDataFrame( + [ + (0.1, 0.0), + (0.4, 1.0), + (1.2, 1.3), + (1.5, float("nan")), + (float("nan"), 1.0), + (float("nan"), 0.0), + ], + ["v1", "v2"], + ) + + bucketizer = Binarizer(threshold=1.0, inputCol="v1", outputCol="f1") + output = bucketizer.transform(df) + self.assertEqual(output.columns, ["v1", "v2", "f1"]) + self.assertEqual(output.count(), 6) + self.assertEqual( + [r.f1 for r in output.select("f1").collect()], + [0.0, 0.0, 1.0, 1.0, 0.0, 0.0], + ) + + bucketizer = Binarizer(threshold=1.0, inputCols=["v1", "v2"], outputCols=["f1", "f2"]) + output = bucketizer.transform(df) + self.assertEqual(output.columns, ["v1", "v2", "f1", "f2"]) + self.assertEqual(output.count(), 6) + self.assertEqual( + [r.f1 for r in output.select("f1").collect()], + [0.0, 0.0, 1.0, 1.0, 0.0, 0.0], + ) + self.assertEqual( + [r.f2 for r in output.select("f2").collect()], + [0.0, 0.0, 1.0, 0.0, 0.0, 0.0], + ) + + # save & load + with tempfile.TemporaryDirectory(prefix="binarizer") as d: + bucketizer.write().overwrite().save(d) + bucketizer2 = Binarizer.load(d) + self.assertEqual(str(bucketizer), str(bucketizer2)) + def test_idf(self): dataset = self.spark.createDataFrame( [(DenseVector([1.0, 2.0]),), (DenseVector([0.0, 1.0]),), (DenseVector([3.0, 0.2]),)],