Utility function to get a setup & cleanup function for mapping each partition #456
base: master
Changes from 8 commits
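For context, a minimal sketch of how the proposed Scala API is meant to be used, based on the PartitionMapper interface and the test included in this diff (the mapper body and variable names below are illustrative only, not part of the change):

import spark.RDD.PartitionMapper

// Assuming sc: SparkContext is already in scope.
val rdd = sc.parallelize(1 to 100, 4)
val doubled = rdd.mapWithSetupAndCleanup(new PartitionMapper[Int, Int]() {
  def setup(partition: Int) { println("starting partition " + partition) }  // once, before the first element
  def map(i: Int) = i * 2                                                   // once per element
  def cleanup = println("finished a partition")                             // after the partition's task completes
}).collect()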
New Java file JavaDoublePartitionMapper.java (package spark.api.java):

@@ -0,0 +1,12 @@
package spark.api.java;

import java.io.Serializable;

public abstract class JavaDoublePartitionMapper<T> implements Serializable {

    public abstract void setup(int partition);

    public abstract Double map(T t) throws Exception;

    public abstract void cleanup();
}
New Java file JavaPairPartitionMapper.java (package spark.api.java):

@@ -0,0 +1,14 @@
package spark.api.java;

import scala.Tuple2;

import java.io.Serializable;

public abstract class JavaPairPartitionMapper<T, K, V> implements Serializable {

    public abstract void setup(int partition);

    public abstract Tuple2<K, V> map(T t) throws Exception;

    public abstract void cleanup();
}
Changes to the JavaRDDLike trait (package spark.api.java):

@@ -10,6 +10,8 @@ import spark.api.java.function.{Function2 => JFunction2, Function => JFunction,
import spark.partial.{PartialResult, BoundedDouble}
import spark.storage.StorageLevel
import com.google.common.base.Optional
import spark.RDD.PartitionMapper
import spark.api.java.ManifestHelper.fakeManifest


trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends PairFlatMapWorkaround[T] {

@@ -116,6 +118,41 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends PairFlatMapWorkaround
    JavaPairRDD.fromRDD(rdd.mapPartitions(fn))(f.keyType(), f.valueType())
  }

  /**
   * Return a new RDD by applying a function to each element of the RDD, with additional
   * setup and cleanup steps that run before and after each partition is computed.
   */
  def mapWithSetupAndCleanup[U](m: PartitionMapper[T, U]): JavaRDD[U] = {
    JavaRDD.fromRDD(rdd.mapWithSetupAndCleanup(m)(fakeManifest[U]))(fakeManifest[U])
  }

  /**
   * Return a new RDD by applying a function to each element of the RDD, with additional
   * setup and cleanup steps that run before and after each partition is computed.
   */
  def mapWithSetupAndCleanup[K, V](m: JavaPairPartitionMapper[T, K, V]): JavaPairRDD[K, V] = {
    val scalaMapper = new PartitionMapper[T, (K, V)] {
      def setup(partition: Int) = m.setup(partition)
      def map(t: T) = m.map(t)
      def cleanup = m.cleanup()
    }
    JavaPairRDD.fromRDD(rdd.mapWithSetupAndCleanup(scalaMapper)(fakeManifest[(K, V)]))(
      fakeManifest[K], fakeManifest[V])
  }

  /**
   * Return a new RDD by applying a function to each element of the RDD, with additional
   * setup and cleanup steps that run before and after each partition is computed.
   */
  def mapWithSetupAndCleanup(m: JavaDoublePartitionMapper[T]): JavaDoubleRDD = {
    val scalaMapper = new PartitionMapper[T, Double] {
      def setup(partition: Int) = m.setup(partition)
      def map(t: T) = m.map(t)
      def cleanup = m.cleanup()
    }
    JavaDoubleRDD.fromRDD(rdd.mapWithSetupAndCleanup(scalaMapper)(manifest[Double]))
  }

  /**
   * Return an RDD created by coalescing all elements within each partition into an array.
   */
New Java file ManifestHelper.java (package spark.api.java):

@@ -0,0 +1,11 @@
package spark.api.java;

import scala.reflect.ClassManifest;
import scala.reflect.ClassManifest$;

class ManifestHelper {

    public static <R> ClassManifest<R> fakeManifest() {
        return (ClassManifest<R>) ClassManifest$.MODULE$.fromClass(Object.class);
    }
}

Reviewer comment on ManifestHelper: This ManifestHelper class is a good idea. We could also use it to create the fake manifests in the Java …
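The helper exists because Java callers of the new JavaRDDLike methods cannot supply the implicit ClassManifest that the underlying Scala RDD method requires, so a manifest for Object is substituted and cast to the requested type; as the reviewer comment above notes, the Java API already creates fake manifests like this elsewhere. A rough Scala-side equivalent, shown only to illustrate what fakeManifest does (not part of the diff):

import scala.reflect.ClassManifest

// Sketch: substitute a manifest for AnyRef and cast it to the requested element type.
def fakeManifest[R]: ClassManifest[R] =
  ClassManifest.fromClass(classOf[AnyRef]).asInstanceOf[ClassManifest[R]]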
New RDD subclass MapPartitionsWithSetupAndCleanup (package spark.rdd):

@@ -0,0 +1,26 @@
package spark.rdd

import spark.{TaskContext, Split, RDD}
import spark.RDD.PartitionMapper

/**
 * An RDD that maps each element of the parent RDD through a PartitionMapper,
 * calling setup(partition) before computing a partition and cleanup() once the
 * task for that partition completes.
 */
class MapPartitionsWithSetupAndCleanup[U: ClassManifest, T: ClassManifest](
    prev: RDD[T],
    m: PartitionMapper[T, U],
    preservesPartitioning: Boolean
  ) extends RDD[U](prev) {

  override def getSplits = firstParent[T].splits

  override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def compute(split: Split, context: TaskContext) = {
    // The mapped iterator below is lazy, so cleanup is registered as an
    // on-complete callback instead of being called when compute returns.
    context.addOnCompleteCallback(m.cleanup _)
    m.setup(split.index)
    firstParent[T].iterator(split, context).map(m.map _)
  }
}
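The typical motivation for the setup/cleanup hooks is per-partition resource management. A hypothetical sketch of such a mapper follows; the JDBC connection and the LookupMapper name are illustrative only and not part of this change. It also shows why compute registers cleanup through addOnCompleteCallback: elements are mapped lazily as the task consumes the iterator, so the resource must stay open until the task finishes.

import spark.RDD.PartitionMapper

// Hypothetical: one JDBC connection per partition (not part of this PR).
class LookupMapper(url: String) extends PartitionMapper[String, Int] {
  @transient var conn: java.sql.Connection = _

  def setup(partition: Int) {
    conn = java.sql.DriverManager.getConnection(url)  // opened once per partition
  }

  def map(key: String) = key.length  // a real mapper would use conn here

  def cleanup = {
    if (conn != null) conn.close()   // runs from the task's on-complete callback
  }
}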
Changes to RDDSuite (package spark):

@@ -1,9 +1,11 @@
package spark

import scala.collection.mutable.HashMap
import scala.collection.Set
import org.scalatest.FunSuite
import spark.SparkContext._
import spark.rdd.{CoalescedRDD, PartitionPruningRDD}
import spark.RDD.PartitionMapper

class RDDSuite extends FunSuite with LocalSparkContext {

@@ -173,4 +175,48 @@ class RDDSuite extends FunSuite with LocalSparkContext
    assert(prunedData.size === 1)
    assert(prunedData(0) === 10)
  }

  test("mapPartitionWithSetupAndCleanup") {
    sc = new SparkContext("local[4]", "test")
    val data = sc.parallelize(1 to 100, 4)
    val acc = sc.accumulableCollection(new HashMap[Int, Set[Int]]())
    val mapped = data.mapWithSetupAndCleanup(new PartitionMapper[Int, Int]() {
      var partition = -1
      var values = Set[Int]()
      def setup(partition: Int) { this.partition = partition }
      def map(i: Int) = { values += i; i * 2 }
      def cleanup = {
        // The purpose of this strange code is just to make sure this method is called
        // after the data has been iterated through completely.
        acc.localValue += (partition -> values)
      }
    }).collect

    assert(mapped.toSet === (1 to 100).map{_ * 2}.toSet)
    assert(acc.value.keySet === (0 to 3).toSet)
    acc.value.foreach { case (partition, values) =>
      assert(values.size === 25)
    }

    // The naive alternative doesn't work:
    val acc2 = sc.accumulableCollection(new HashMap[Int, Set[Int]]())
    val m2 = data.mapPartitionsWithSplit {
      case (partition, itr) =>
        var values = Set[Int]()
        val mItr = itr.map { i => values += i; i * 2 }
        // Nothing has been added to values yet, because itr.map defines another
        // iterator that is computed lazily, so the Set is still empty here.
        acc2.localValue += (partition -> values)
        mItr
    }.collect

    assert(m2.toSet === (1 to 100).map{_ * 2}.toSet)
    assert(acc2.value.keySet === (0 to 3).toSet)
    // This condition would fail:
    // acc2.value.foreach { case (partition, values) =>
    //   assert(values.size === 25)
    // }
  }
}

Reviewer comment (on the commented-out assertion): If this is supposed to fail, should we wrap it in an …

Author reply: Well, it's not really supposed to fail -- there is no exception, it just doesn't give the "expected" result. That second part isn't really a unit test at all; it's just documentation of why this method is needed. It probably doesn't belong here at all -- I just wanted it as part of the pull request to demonstrate why the method is needed. I guess I can just write it up somewhere else (it seems too long to put in the Spark docs as well, at least in the current layout ...).
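As the author notes in the discussion above, the second half of the test documents the problem rather than asserting anything. For what it's worth, a sketch (not part of this PR) of how the naive version could be made to record the per-partition values, reusing sc and data from the test: forcing the mapped iterator before touching the accumulator works, at the cost of materializing a whole partition in memory.

val acc3 = sc.accumulableCollection(new HashMap[Int, Set[Int]]())
val m3 = data.mapPartitionsWithSplit { (partition: Int, itr: Iterator[Int]) =>
  var values = Set[Int]()
  val mapped = itr.map { i => values += i; i * 2 }.toList  // forces evaluation eagerly
  acc3.localValue += (partition -> values)                 // values is now complete
  mapped.iterator
}.collect()
assert(acc3.value.values.forall(_.size == 25))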
Reviewer comment: I would rename the instances of WithSetupAndCleanup to just WithCleanup for simplicity. Having cleanup already implies that there's something to be cleaned up.