-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScalaWordCount.scala
55 lines (44 loc) · 2.27 KB
/
ScalaWordCount.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.storage._
/**
 * Tutorial driver that exercises a handful of basic Spark RDD operations:
 * a word count, `take`/`foreach`, `persist`, `filter`, and `parallelize`.
 *
 * Everything runs against a local master and reads `SOFTwitter.json` from the
 * current working directory; results are printed to standard output.
 */
object ScalaWordCount {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local").setAppName("Scala word count")
    val sc = new SparkContext(conf)

    // Always stop the context, even when an action fails (e.g. the input file
    // is missing), so the local Spark runtime is shut down cleanly.
    try {
      val input = sc.textFile("SOFTwitter.json")

      // Split every line on single spaces and count occurrences of each token.
      val words = input.flatMap(line => line.split(" "))
      val counts = words.map(word => (word, 1)).reduceByKey(_ + _)
      //counts.saveAsTextFile("wordCountOutput")

      /**************************************************************/
      /* take() and foreach() example                               */
      /**************************************************************/
      counts.take(10).foreach(println)

      /**************************************************************/
      /* persist() allows specifying the storage level;             */
      /* cache() uses the default level.                            */
      /**************************************************************/
      val memCounts = counts.persist(StorageLevel.MEMORY_ONLY_SER)
      println(s"After persist() call * * * ${memCounts.count()}")
      println(memCounts.toDebugString)

      /**************************************************************/
      /* Filter example passing a brace-delimited block as the      */
      /* function argument. The block's last expression is the      */
      /* predicate; the `nothing` value is intentionally unused and */
      /* only shows that a block may hold statements before it.     */
      /**************************************************************/
      val filterCounts = input.filter {
        val nothing = 0
        line => line.contains("Scala")
      }
      println(s"Filtered count * * * ${filterCounts.count()}")

      /**************************************************************/
      /* Create an RDD from a local collection using parallelize()  */
      /**************************************************************/
      val listLines = sc.parallelize(List("kushlie", "lowly", "corduroy"))
      println(s"* * * ${listLines.count()}")
      println("here some the lines => ")
      listLines.collect().foreach(println)
    } finally {
      sc.stop()
    }
  }
}