Skip to content

Latest commit

 

History

History
30 lines (24 loc) · 765 Bytes

scala-spark.md

File metadata and controls

30 lines (24 loc) · 765 Bytes

Scala Spark

Import libraries and create Hive context

import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.hive._
import org.apache.spark.sql.types.{StructType,StructField,StringType,DoubleType}
import org.apache.spark.rdd.RDD

// Build a Hive-aware SQL context on top of the existing SparkContext `sc`
// (`sc` is not defined in this snippet; presumably the one spark-shell provides).
// NOTE(review): HiveContext is the Spark 1.x entry point; on Spark 2.0+ it is
// deprecated in favour of SparkSession with enableHiveSupport() -- confirm the target version.
val hc = new HiveContext(sc)

// Bring the context's implicit conversions into scope (needed for the $"col" syntax).
import hc.implicits._

DataFrame - API

// Run a query through the Hive context; the result is a DataFrame.
val df = hc.sql("select * from db.table")

// Inspect the result: display rows, print the schema, list the column names.
df.show()
df.printSchema()
df.columns

// Project a single column, then count rows per distinct "age" value.
df.select($"name").show()
df.groupBy($"age").count().show()

// Count rows per (col1, col2) pair, keep only the groups whose col2 equals 'value',
// order them by descending count, and drop col2 from the result.
val foo = {
  val pairCounts = df.groupBy("col1", "col2").count()
  pairCounts
    .filter("col2 = 'value'")
    .orderBy($"count".desc)
    .drop("col2")
}