
Commit 73a4a45

Fixed build warnings due to upcoming Scala 3 changes and previously deprecated methods. (#52)
jverbus authored May 30, 2024
1 parent b5dc70a commit 73a4a45
Showing 7 changed files with 39 additions and 39 deletions.
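Most of the changes below fall into three buckets: methods declared with an empty parameter list are now called with explicit parentheses (`head()`, `toDS()`, `TaskContext.get()`, `extractParamMap()`, `nextDouble()`, `overwrite()`, `collect()`, `count()`, `builder()`), since Scala 2.13 deprecates and Scala 3 disallows this "auto-application"; `Array`s are converted with `.toIndexedSeq` before being passed where a `Seq` is expected, because the implicit copying conversion is deprecated since 2.13; and procedure syntax gains an explicit `: Unit =`, since that syntax is dropped in Scala 3. A minimal sketch of the auto-application rule, with illustrative names only:

```scala
import scala.util.Random

object AutoApplicationExample {
  def main(args: Array[String]): Unit = {
    val rnd = new Random(42)
    // Random.nextDouble is declared with an empty parameter list, so the
    // Scala 3-ready spelling keeps the parentheses at the call site.
    val x = rnd.nextDouble() // was: rnd.nextDouble
    println(x)
  }
}
```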
@@ -46,7 +46,7 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores

// Validate $(maxFeatures) and $(maxSamples) against input dataset and determine the values
// actually used to train the model: numFeatures and numSamples
- val totalNumFeatures = dataset.head.features.length
+ val totalNumFeatures = dataset.head().features.length
val numFeatures = if ($(maxFeatures) > 1.0) {
math.floor($(maxFeatures)).toInt
} else {
@@ -94,7 +94,7 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores
.partitionBy(new HashPartitioner($(numEstimators)))
val repartitionedFlattenedSampledDataset = repartitionedFlattenedSampledRdd
.mapPartitions(x => x.map(y => y._2), preservesPartitioning = true)
- .toDS
+ .toDS()
logInfo(s"Training ${$(numEstimators)} isolation trees using" +
s" ${repartitionedFlattenedSampledDataset.rdd.getNumPartitions} partitions.")

@@ -106,7 +106,7 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores
// Use a different seed for each partition to ensure each partition has an independent set of
// random numbers. This ensures each tree is truly trained independently and doing so has a
// measurable effect on the results.
- val seed = $(randomSeed) + TaskContext.get.partitionId() + 2
+ val seed = $(randomSeed) + TaskContext.get().partitionId() + 2
val rnd = new scala.util.Random(seed)

val dataForTree = rnd.shuffle(x.toSeq).slice(0, numSamples).toArray
@@ -124,7 +124,7 @@ class IsolationForest(override val uid: String) extends Estimator[IsolationFores
// random numbers. This ensures each tree is truly trained independently and doing so has a
// measurable effect on the results.
Iterator(IsolationTree
- .fit(dataForTree, $(randomSeed) + $(numEstimators) + TaskContext.get.partitionId() + 2, featureIndices))
+ .fit(dataForTree, $(randomSeed) + $(numEstimators) + TaskContext.get().partitionId() + 2, featureIndices))
}).collect()

val isolationForestModel = copyValues(
@@ -254,7 +254,7 @@ private[isolationforest] case object IsolationForestModelReadWrite extends Loggi

saveMetadata(instance, path, spark, Some(extraMetadata))
val dataPath = new Path(path, "data").toString
- val nodeDataRDD = spark.sparkContext.parallelize(instance.isolationTrees.zipWithIndex)
+ val nodeDataRDD = spark.sparkContext.parallelize(instance.isolationTrees.zipWithIndex.toIndexedSeq)
.flatMap { case (tree, treeID) => EnsembleNodeData.build(tree, treeID) }
logInfo(s"Saving IsolationForestModel tree data to path ${dataPath}")
spark.createDataFrame(nodeDataRDD)
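`SparkContext.parallelize` expects a `Seq[T]`, so passing an `Array` relied on the implicit `Array`-to-`immutable.IndexedSeq` conversion that Scala 2.13 deprecates because it copies silently; the explicit `.toIndexedSeq` keeps the same behavior while making the copy visible. A self-contained sketch under those assumptions (app name and data are illustrative):

```scala
import org.apache.spark.sql.SparkSession

object ParallelizeExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("parallelize-example")
      .getOrCreate()

    val values = Array(1.0, 2.0, 3.0)
    // parallelize takes a Seq; convert the Array explicitly rather than
    // relying on the deprecated implicit copying conversion.
    val rdd = spark.sparkContext.parallelize(values.toIndexedSeq)
    println(rdd.count())

    spark.stop()
  }
}
```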
@@ -299,7 +299,7 @@ private[isolationforest] case object IsolationForestModelReadWrite extends Loggi

val uid = instance.uid
val cls = instance.getClass.getName
- val params = instance.extractParamMap.toSeq
+ val params = instance.extractParamMap().toSeq
val jsonParams = render(params.map { case ParamPair(p, v) =>
p.name -> parse(p.jsonEncode(v))
}.toList)
@@ -47,7 +47,7 @@ private[isolationforest] case object IsolationTree extends Logging {
def fit(data: Array[DataPoint], randomSeed: Long, featureIndices: Array[Int]): IsolationTree = {

logInfo(s"Fitting isolation tree with random seed ${randomSeed} on" +
s" ${featureIndices.seq.toString} features (indices) using ${data.length} data points.")
s" ${featureIndices.toIndexedSeq.toString} features (indices) using ${data.length} data points.")

def log2(x: Double): Double = math.log10(x) / math.log10(2.0)
val heightLimit = math.ceil(log2(data.length.toDouble)).toInt
@@ -124,7 +124,7 @@ private[isolationforest] case object IsolationTree extends Logging {
if (minFeatureValue != maxFeatureValue) {
foundFeature = true
featureIndex = featureIndexTrial
- featureSplitValue = ((maxFeatureValue - minFeatureValue) * randomState.nextDouble
+ featureSplitValue = ((maxFeatureValue - minFeatureValue) * randomState.nextDouble()
+ minFeatureValue)
}
}
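The logging change above avoids the deprecated `.seq` view from the pre-2.13 collections and sidesteps `Array.toString`, which prints a JVM reference rather than the elements. A small sketch of the difference (values illustrative):

```scala
object FeatureIndicesLogExample {
  def main(args: Array[String]): Unit = {
    val featureIndices = Array(0, 1, 2)
    // Array.toString prints a JVM reference such as "[I@1a2b3c", and .seq is
    // deprecated in the Scala 2.13 collections; an explicit conversion yields
    // a readable element listing for the log message.
    println(featureIndices.toIndexedSeq.toString) // e.g. "ArraySeq(0, 1, 2)"
  }
}
```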
@@ -45,7 +45,7 @@ class BaggedPointTest {
numCols: Int,
expectedMean: Double,
expectedStddev: Double,
- epsilon: Double) {
+ epsilon: Double): Unit = {

val values = new mutable.ArrayBuffer[Double]()
data.foreach { row =>
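The `): Unit = {` change above is the procedure-syntax fix: Scala 3 drops `def f(...) { ... }`, so the helper gains an explicit result type and `=`. A minimal before/after sketch (helper name is illustrative, not from this repo):

```scala
object ProcedureSyntaxExample {
  // Before (procedure syntax, removed in Scala 3):
  //   def check(data: Array[Double]) { assert(data.nonEmpty) }

  // After: an explicit Unit result type and "=".
  def check(data: Array[Double]): Unit = {
    assert(data.nonEmpty)
  }
}
```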
@@ -63,7 +63,7 @@ class BaggedPointTest {
val spark = getSparkSession

val dataPointArray = generateDataPoints(1, 1000)
- val rdd = spark.sparkContext.parallelize(dataPointArray)
+ val rdd = spark.sparkContext.parallelize(dataPointArray.toIndexedSeq)
val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, 1, false, 42)
baggedRDD.collect().foreach { baggedPoint =>
assert(baggedPoint.subsampleWeights.size == 1 && baggedPoint.subsampleWeights(0) == 1)
@@ -80,7 +80,7 @@ class BaggedPointTest {

val seeds = Array(123, 5354, 230, 349867, 23987)
val arr = generateDataPoints(1, 1000)
- val rdd = spark.sparkContext.parallelize(arr)
+ val rdd = spark.sparkContext.parallelize(arr.toIndexedSeq)
seeds.foreach { seed =>
val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, true, seed)
val subsampleCounts: Array[Array[Double]] = baggedRDD
@@ -101,7 +101,7 @@ class BaggedPointTest {

val seeds = Array(123, 5354, 230, 349867, 23987)
val arr = generateDataPoints(1, 1000)
- val rdd = spark.sparkContext.parallelize(arr)
+ val rdd = spark.sparkContext.parallelize(arr.toIndexedSeq)
seeds.foreach { seed =>
val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, true, seed)
val subsampleCounts: Array[Array[Double]] = baggedRDD
@@ -121,7 +121,7 @@ class BaggedPointTest {

val seeds = Array(123, 5354, 230, 349867, 23987)
val arr = generateDataPoints(1, 1000)
- val rdd = spark.sparkContext.parallelize(arr)
+ val rdd = spark.sparkContext.parallelize(arr.toIndexedSeq)
seeds.foreach { seed =>
val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, 1.0, numSubsamples, false, seed)
val subsampleCounts: Array[Array[Double]] = baggedRDD
@@ -142,7 +142,7 @@ class BaggedPointTest {

val seeds = Array(123, 5354, 230, 349867, 23987)
val arr = generateDataPoints(1, 1000)
- val rdd = spark.sparkContext.parallelize(arr)
+ val rdd = spark.sparkContext.parallelize(arr.toIndexedSeq)
seeds.foreach { seed =>
val baggedRDD = BaggedPoint.convertToBaggedRDD(rdd, subsample, numSubsamples, false, seed)
val subsampleCounts: Array[Array[Double]] = baggedRDD
@@ -168,7 +168,7 @@ class BaggedPointTest {
(1, dataPointArray(1)),
(1, dataPointArray(1)))

- val dataPointRDD = spark.sparkContext.parallelize(dataPointArray)
+ val dataPointRDD = spark.sparkContext.parallelize(dataPointArray.toIndexedSeq)
val baggedPointRDD = dataPointRDD.map(x => new BaggedPoint(x, subsampleWeights))
val flattenedBaggedPointRDD = BaggedPoint.flattenBaggedRDD(baggedPointRDD, 1L)
val flattenedBaggedPointArray = flattenedBaggedPointRDD.collect()
@@ -187,7 +187,7 @@ class BaggedPointTest {
val dataPointArray = generateDataPoints(10, numRecords)
val subsampleWeights = Array(1.3, 1.75)

- val dataPointRDD = spark.sparkContext.parallelize(dataPointArray)
+ val dataPointRDD = spark.sparkContext.parallelize(dataPointArray.toIndexedSeq)
val baggedPointRDD = dataPointRDD.map(x => new BaggedPoint(x, subsampleWeights))
val flattenedBaggedPointRDD = BaggedPoint.flattenBaggedRDD(baggedPointRDD, 1L)
val flattenedBaggedPointArray = flattenedBaggedPointRDD.collect()
@@ -38,14 +38,14 @@ class IsolationForestModelWriteReadTest extends Logging {

// Write the trained model to disk and then read it back from disk
val savePath = System.getProperty("java.io.tmpdir") + "/savedIsolationForestModel"
- isolationForestModel1.write.overwrite.save(savePath)
+ isolationForestModel1.write.overwrite().save(savePath)
val isolationForestModel2 = IsolationForestModel.load(savePath)
deleteDirectory(new File(savePath))

// Assert that all parameter values are equal
Assert.assertEquals(
- isolationForestModel1.extractParamMap.toString,
- isolationForestModel2.extractParamMap.toString)
+ isolationForestModel1.extractParamMap().toString,
+ isolationForestModel2.extractParamMap().toString)
Assert.assertEquals(isolationForestModel1.getNumSamples, isolationForestModel2.getNumSamples)
Assert.assertEquals(isolationForestModel1.getNumFeatures, isolationForestModel2.getNumFeatures)
Assert.assertEquals(
@@ -64,8 +64,8 @@ class IsolationForestModelWriteReadTest extends Logging {
Assert.assertEquals(auroc1, auroc2)

// Assert the predicted labels are equal
- val predictedLabels1 = scores1.map(x => x.predictedLabel).collect
- val predictedLabels2 = scores2.map(x => x.predictedLabel).collect
+ val predictedLabels1 = scores1.map(x => x.predictedLabel).collect()
+ val predictedLabels2 = scores2.map(x => x.predictedLabel).collect()
Assert.assertEquals(predictedLabels1.toSeq, predictedLabels2.toSeq)

// Compare each tree in the original and saved/loaded model and assert they are equal
@@ -102,14 +102,14 @@ class IsolationForestModelWriteReadTest extends Logging {

// Write the trained model to disk and then read it back from disk
val savePath = System.getProperty("java.io.tmpdir") + "/savedIsolationForestModelZeroContamination"
- isolationForestModel1.write.overwrite.save(savePath)
+ isolationForestModel1.write.overwrite().save(savePath)
val isolationForestModel2 = IsolationForestModel.load(savePath)
deleteDirectory(new File(savePath))

// Assert that all parameter values are equal
Assert.assertEquals(
- isolationForestModel1.extractParamMap.toString,
- isolationForestModel2.extractParamMap.toString)
+ isolationForestModel1.extractParamMap().toString,
+ isolationForestModel2.extractParamMap().toString)
Assert.assertEquals(isolationForestModel1.getNumSamples, isolationForestModel2.getNumSamples)
Assert.assertEquals(isolationForestModel1.getNumFeatures, isolationForestModel2.getNumFeatures)
Assert.assertEquals(
@@ -128,8 +128,8 @@ class IsolationForestModelWriteReadTest extends Logging {
Assert.assertEquals(auroc1, auroc2)

// Assert the predicted labels are equal and always 0.0
- val predictedLabels1 = scores1.map(x => x.predictedLabel).collect
- val predictedLabels2 = scores2.map(x => x.predictedLabel).collect
+ val predictedLabels1 = scores1.map(x => x.predictedLabel).collect()
+ val predictedLabels2 = scores2.map(x => x.predictedLabel).collect()
val expectedLabels = Array.fill[Double](predictedLabels1.length)(0.0)
Assert.assertEquals(predictedLabels1.toSeq, predictedLabels2.toSeq)
Assert.assertEquals(predictedLabels2.toSeq, expectedLabels.toSeq)
@@ -182,7 +182,7 @@ class IsolationForestModelWriteReadTest extends Logging {

// Write the trained model to disk and then read it back from disk
val savePath = System.getProperty("java.io.tmpdir") + "/savedIsolationForestModelIdenticalFeatures"
- isolationForestModel1.write.overwrite.save(savePath)
+ isolationForestModel1.write.overwrite().save(savePath)
val isolationForestModel2 = IsolationForestModel.load(savePath)
deleteDirectory(new File(savePath))

@@ -197,8 +197,8 @@ class IsolationForestModelWriteReadTest extends Logging {
val scores2 = isolationForestModel2.transform(data).as[ScoringResult]

Assert.assertEquals(
- scores1.map(x => x.outlierScore).collect.toSeq,
- scores2.map(x => x.outlierScore).collect.toSeq)
+ scores1.map(x => x.outlierScore).collect().toSeq,
+ scores2.map(x => x.outlierScore).collect().toSeq)

spark.stop()
}
@@ -214,14 +214,14 @@ class IsolationForestModelWriteReadTest extends Logging {

// Write the trained model to disk and then read it back from disk
val savePath = System.getProperty("java.io.tmpdir") + "/emptyIsolationForestModelWriteReadTest"
- isolationForestModel1.write.overwrite.save(savePath)
+ isolationForestModel1.write.overwrite().save(savePath)
val isolationForestModel2 = IsolationForestModel.load(savePath)
deleteDirectory(new File(savePath))

// Assert that all parameter values are equal
Assert.assertEquals(
- isolationForestModel1.extractParamMap.toString,
- isolationForestModel2.extractParamMap.toString)
+ isolationForestModel1.extractParamMap().toString,
+ isolationForestModel2.extractParamMap().toString)
Assert.assertEquals(isolationForestModel1.getNumSamples, isolationForestModel2.getNumSamples)
Assert.assertEquals(isolationForestModel1.getNumFeatures, isolationForestModel2.getNumFeatures)
Assert.assertEquals(
@@ -32,13 +32,13 @@ class IsolationForestTest {
.setContaminationError(contamination * 0.01)
.setRandomSeed(1)

- isolationForest1.write.overwrite.save(savePath)
+ isolationForest1.write.overwrite().save(savePath)
val isolationForest2 = IsolationForest.load(savePath)
deleteDirectory(new File(savePath))

Assert.assertEquals(
- isolationForest1.extractParamMap.toString,
- isolationForest2.extractParamMap.toString)
+ isolationForest1.extractParamMap().toString,
+ isolationForest2.extractParamMap().toString)

spark.stop()
}
@@ -148,7 +148,7 @@ class IsolationForestTest {

// Calculate area under ROC curve and assert
val scores = isolationForestModel.transform(data).as[ScoringResult]
- val predictedLabels = scores.map(x => x.predictedLabel).collect
+ val predictedLabels = scores.map(x => x.predictedLabel).collect()
val expectedLabels = Array.fill[Double](predictedLabels.length)(0.0)

Assert.assertEquals(
@@ -195,10 +195,10 @@ class IsolationForestTest {

val labeledOutlierScoresMean = labeledOutlierScores
.map(_.outlierScore)
- .reduce(_+_) / labeledOutlierScores.count
+ .reduce(_+_) / labeledOutlierScores.count()
val labeledInlierScoresMean = labeledInlierScores
.map(_.outlierScore)
- .reduce(_+_) / labeledInlierScores.count
+ .reduce(_+_) / labeledInlierScores.count()

val uncert = 0.02
val expectedOutlierScoreMean = 0.61
@@ -28,7 +28,7 @@ object TestUtils {
}

// local context with 4 threads
- SparkSession.builder
+ SparkSession.builder()
.master("local[4]")
.appName("testing-spark")
.config(sparkConf)
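Several test changes above also add parentheses to `write.overwrite.save(...)`: `MLWriter.overwrite()` is declared with an empty parameter list and returns the writer itself, so the Scala 3-ready chain keeps its parens. A hedged sketch against Spark's generic `MLWritable` interface (the helper is illustrative, not from this repo):

```scala
import org.apache.spark.ml.util.MLWritable

object SaveExample {
  // Overwrite any existing artifact at `path`; applies to any MLWritable,
  // including the IsolationForestModel saved and reloaded in these tests.
  def saveOverwriting(model: MLWritable, path: String): Unit =
    model.write.overwrite().save(path) // was: model.write.overwrite.save(path)
}
```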
