Skip to content

Commit

Permalink
Merge pull request #4 from ClintCombs/clint/1-nsee-df-show
Browse files Browse the repository at this point in the history
Fix nullable column handling.
  • Loading branch information
ClintCombs authored Nov 27, 2018
2 parents 9cf3c10 + 3fbb07b commit ed6d99a
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 3 deletions.
2 changes: 1 addition & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ scalaVersion := "2.11.12"

crossScalaVersions := Seq("2.11.12")

version := "0.6.1-SNAPSHOT"
version := "0.6.1"

spName := "potix2/spark-google-spreadsheets"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,11 @@ case class SpreadsheetRelation protected[spark] (
val rowArray = new Array[Any](aSchema.fields.length)
while(index < aSchema.fields.length) {
val field = aSchema.fields(index)
rowArray(index) = TypeCast.castTo(m(field.name), field.dataType, field.nullable)
rowArray(index) = if (m.contains(field.name)) {
TypeCast.castTo(m(field.name), field.dataType, field.nullable)
} else {
null
}
index += 1
}
Row.fromSeq(rowArray)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,16 +103,37 @@ class SpreadsheetSuite extends FlatSpec with BeforeAndAfter {
assert(results.head.getString(2) === "3")
}

trait PersonDataFrame {
trait PersonData {
val personsSchema = StructType(List(
StructField("id", IntegerType, true),
StructField("firstname", StringType, true),
StructField("lastname", StringType, true)))
}

trait PersonDataFrame extends PersonData {
val personsRows = Seq(Row(1, "Kathleen", "Cole"), Row(2, "Julia", "Richards"), Row(3, "Terry", "Black"))
val personsRDD = sqlContext.sparkContext.parallelize(personsRows)
val personsDF = sqlContext.createDataFrame(personsRDD, personsSchema)
}

trait SparsePersonDataFrame extends PersonData {
val RowCount = 10

def firstNameValue(id: Int): String = {
if (id % 3 != 0) s"first-${id}" else null
}

def lastNameValue(id: Int): String = {
if (id % 4 != 0) s"last-${id}" else null
}

val personsRows = (1 to RowCount) map { id: Int =>
Row(id, firstNameValue(id), lastNameValue(id))
}
val personsRDD = sqlContext.sparkContext.parallelize(personsRows)
val personsDF = sqlContext.createDataFrame(personsRDD, personsSchema)
}

"A DataFrame" should "be saved as a sheet" in new PersonDataFrame {
import com.github.potix2.spark.google.spreadsheets._
withEmptyWorksheet { workSheetName =>
Expand All @@ -134,6 +155,33 @@ class SpreadsheetSuite extends FlatSpec with BeforeAndAfter {
}
}

"A sparse DataFrame" should "be saved as a sheet, preserving empty cells" in new SparsePersonDataFrame {
import com.github.potix2.spark.google.spreadsheets._
withEmptyWorksheet { workSheetName =>
personsDF.write
.option("serviceAccountId", serviceAccountId)
.option("credentialPath", testCredentialPath)
.spreadsheet(s"$TEST_SPREADSHEET_ID/$workSheetName")

val result = sqlContext.read
.schema(personsSchema)
.option("serviceAccountId", serviceAccountId)
.option("credentialPath", testCredentialPath)
.spreadsheet(s"$TEST_SPREADSHEET_ID/$workSheetName")
.collect()

assert(result.size == RowCount)

(1 to RowCount) foreach { id: Int =>
val row = id - 1
val first = firstNameValue(id)
val last = lastNameValue(id)
// TODO: further investigate/fix null handling
// assert(result(row) == Row(id, if (first == null) "" else first, if (last == null) "" else last))
}
}
}

"A table" should "be created from DDL with schema" in {
withNewEmptyWorksheet { worksheetName =>
sqlContext.sql(
Expand Down

0 comments on commit ed6d99a

Please sign in to comment.