diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 67395f2..7e1d68e 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -23,5 +23,7 @@ jobs:
       - uses: coursier/setup-action@v1
         with:
           jvm: adopt:1.8
-      - name: Build, test, and package project
-        run: bin/sbt clean compile test package makePom
+      - name: Build, test, and package project on Spark 3.5
+        run: bin/sbt clean compile test package makePom -DsparkVersion=3.5.1
+      - name: Build and package project on "legacy" Spark
+        run: bin/sbt clean compile package makePom
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index baa6c8f..4c6c21c 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -25,3 +25,5 @@ jobs:
       # uses sbt-github-packages, see build.sbt
       - name: Publish with SBT
         run: bin/sbt publish
+      - name: Publish with SBT for Spark 3.5
+        run: bin/sbt publish -DsparkVersion=3.5.1
diff --git a/README.md b/README.md
index c4bbe6d..c7aee7f 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,9 @@ Usage: data-validator [options]
   --help                   Show this help message and exit.
 ```
 
+To build with Java 11 or newer, set the `MODERN_JAVA` environment variable to `TRUE`.
+This may become the default in the future.
+
 ## Example Run
 
 With the JAR directly:
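Note: the two switches compose. A Spark 3.5 build on a modern JDK would be `MODERN_JAVA=TRUE bin/sbt clean compile test package makePom -DsparkVersion=3.5.1`, combining the CI invocation above with the `MODERN_JAVA` check in `build.sbt` below.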
diff --git a/bin/sbt b/bin/sbt
index 19d8917..8ecfd33 100755
--- a/bin/sbt
+++ b/bin/sbt
@@ -34,11 +34,11 @@ set -o pipefail
 
-declare -r sbt_release_version="1.6.2"
-declare -r sbt_unreleased_version="1.7.0-M2"
+declare -r sbt_release_version="1.9.9"
+declare -r sbt_unreleased_version="1.9.9"
 
-declare -r latest_213="2.13.8"
-declare -r latest_212="2.12.15"
+declare -r latest_213="2.13.13"
+declare -r latest_212="2.12.19"
 declare -r latest_211="2.11.12"
 declare -r latest_210="2.10.7"
 declare -r latest_29="2.9.3"
diff --git a/build.sbt b/build.sbt
index 61e2c89..c0034ad 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,11 +1,42 @@
 name := "data-validator"
 organization := "com.target"
 
-scalaVersion := "2.11.12"
+val sparkVersion = settingKey[String]("Spark version")
 
-val sparkVersion = "2.3.4"
+sparkVersion := System.getProperty("sparkVersion", "2.3.4")
 
-val circeVersion = "0.11.2"
+scalaVersion := {
+  if (sparkVersion.value > "3.0") {
+    "2.12.19"
+  } else {
+    "2.11.12"
+  }
+}
+
+val sparkValidationVersion = settingKey[String]("Version of package")
+
+sparkValidationVersion := "0.15.0"
+
+version := sparkVersion.value + "_" + sparkValidationVersion.value
+
+val circeVersion = settingKey[String]("Circe version")
+val circeYamlVersion = settingKey[String]("Circe YAML version")
+
+circeVersion := {
+  if (sparkVersion.value > "3.0") {
+    "0.14.6"
+  } else {
+    "0.11.2"
+  }
+}
+
+circeYamlVersion := {
+  if (sparkVersion.value > "3.0") {
+    "0.15.1"
+  } else {
+    "0.10.1"
+  }
+}
 
 //addDependencyTreePlugin
 enablePlugins(GitVersioning)
@@ -35,18 +66,28 @@ libraryDependencies ++= Seq(
   "com.github.scopt" %% "scopt" % "4.1.0",
   "com.sun.mail" % "javax.mail" % "1.6.2",
   "com.lihaoyi" %% "scalatags" % "0.12.0",
-  "io.circe" %% "circe-yaml" % "0.10.1",
-  "io.circe" %% "circe-core" % circeVersion,
-  "io.circe" %% "circe-generic" % circeVersion,
-  "io.circe" %% "circe-parser" % circeVersion,
-  "org.apache.spark" %% "spark-sql" % sparkVersion % Provided,
+  "io.circe" %% "circe-yaml" % circeYamlVersion.value,
+  "io.circe" %% "circe-core" % circeVersion.value,
+  "io.circe" %% "circe-generic" % circeVersion.value,
+  "io.circe" %% "circe-parser" % circeVersion.value,
+  "org.apache.spark" %% "spark-sql" % sparkVersion.value % Provided,
   "junit" % "junit" % "4.13.2" % Test,
-  "org.scalatest" %% "scalatest" % "3.2.17" % Test,
+  "org.scalatest" %% "scalatest" % "3.2.18" % Test,
   "com.github.sbt" % "junit-interface" % "0.13.3" % Test exclude ("junit", "junit-dep")
 )
 
 Test / fork := true
-javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:+CMSClassUnloadingEnabled")
+javaOptions ++= (if (sparkVersion.value > "3.0" && System.getenv("MODERN_JAVA") == "TRUE") {
+  // For modern Java we need to open several JDK modules up to reflection.
+  Seq("-Xms4048M", "-Xmx4048M",
+    // the --add-opens flags exist only on JDK 9 and newer
+    "-Dio.netty.tryReflectionSetAccessible=true",
+    "--add-opens=java.base/java.lang=ALL-UNNAMED",
+    "--add-opens=java.base/java.io=ALL-UNNAMED",
+    "--add-opens=java.base/sun.nio.ch=ALL-UNNAMED")
+} else {
+  Seq("-Xms4048M", "-Xmx4048M")
+})
 Test / parallelExecution := false
 // required for unit tests, but not set in some environments
 Test / envVars ++= Map(
@@ -57,6 +98,11 @@ Test / envVars ++= Map(
 
 assembly / mainClass := Some("com.target.data_validator.Main")
 
+assembly / assemblyShadeRules := Seq(
+  ShadeRule.rename("shapeless.**" -> "new_shapeless.@1").inAll,
+  ShadeRule.rename("cats.kernel.**" -> "new_cats.kernel.@1").inAll
+)
+
 // Enforces scalastyle checks
 val compileScalastyle = TaskKey[Unit]("compileScalastyle")
 scalastyleFailOnWarning := true
@@ -75,6 +121,6 @@ compileScalastyle := (Compile / scalastyle).toTask("").value
 (Compile / runMain) := Defaults.runMainTask(Compile / fullClasspath, Compile / run / runner).evaluated
 
 TaskKey[Unit]("generateTestData") := {
-  libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion
+  libraryDependencies += "org.apache.spark" %% "spark-sql" % sparkVersion.value
   (Compile / runMain).toTask(" com.target.data_validator.GenTestData").value
 }
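Note on the version switch: `sparkVersion` is compared lexicographically as a string, which is adequate for the `2.3.4` and `3.5.1` values used here. A plain-Scala sketch of how the derived settings above resolve, illustrative only and not part of the build:

```scala
// Sketch of the settingKey wiring in build.sbt, as a standalone script.
//   bin/sbt ...                      -> legacy defaults ("2.3.4" column)
//   bin/sbt -DsparkVersion=3.5.1 ... -> Spark 3 column
val sparkVersion = sys.props.getOrElse("sparkVersion", "2.3.4")
val isSpark3 = sparkVersion > "3.0" // lexicographic compare, fine for 2.x vs 3.x
val scalaVersion = if (isSpark3) "2.12.19" else "2.11.12"
val circeVersion = if (isSpark3) "0.14.6" else "0.11.2"
val circeYamlVersion = if (isSpark3) "0.15.1" else "0.10.1"
val version = s"${sparkVersion}_0.15.0" // e.g. "3.5.1_0.15.0"
```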
diff --git a/project/assembly.sbt b/project/assembly.sbt
index d83c883..e5ab6cc 100644
--- a/project/assembly.sbt
+++ b/project/assembly.sbt
@@ -1 +1 @@
-addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.1.5")
+addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.2.0")
diff --git a/project/build.properties b/project/build.properties
index 0aa5c39..49214c4 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -1 +1 @@
-sbt.version = 1.9.8
+sbt.version = 1.9.9
diff --git a/project/plugins.sbt b/project/plugins.sbt
index 742243d..a56f0eb 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -2,4 +2,5 @@ addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")
 addSbtPlugin("com.github.sbt" % "sbt-git" % "2.0.1")
 addSbtPlugin("com.eed3si9n" % "sbt-buildinfo" % "0.12.0")
 addSbtPlugin("com.codecommit" % "sbt-github-packages" % "0.5.3")
+addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.10.4")
 addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.2")
diff --git a/src/main/scala/com/target/data_validator/validator/ColumnBased.scala b/src/main/scala/com/target/data_validator/validator/ColumnBased.scala
index a283ab2..21c43c7 100644
--- a/src/main/scala/com/target/data_validator/validator/ColumnBased.scala
+++ b/src/main/scala/com/target/data_validator/validator/ColumnBased.scala
@@ -86,7 +86,7 @@ case class MinNumRows(minNumRows: Json) extends ColumnBased("", ValidatorBase.L0
 }
 
 case class ColumnMaxCheck(column: String, value: Json)
-    extends ColumnBased(column, Max(UnresolvedAttribute(column)).toAggregateExpression()) {
+    extends ColumnBased(column, Max(UnresolvedAttribute.quoted(column)).toAggregateExpression()) {
 
   override def substituteVariables(dict: VarSubstitution): ValidatorBase = {
     val ret = copy(column = getVarSub(column, "column", dict), value = getVarSubJson(value, "value", dict))
diff --git a/src/main/scala/com/target/data_validator/validator/ColumnSumCheck.scala b/src/main/scala/com/target/data_validator/validator/ColumnSumCheck.scala
index 6a01263..d5196f9 100644
--- a/src/main/scala/com/target/data_validator/validator/ColumnSumCheck.scala
+++ b/src/main/scala/com/target/data_validator/validator/ColumnSumCheck.scala
@@ -16,7 +16,7 @@ case class ColumnSumCheck(
     minValue: Option[Json] = None,
     maxValue: Option[Json] = None,
     inclusive: Option[Json] = None
-) extends ColumnBased(column, Sum(UnresolvedAttribute(column)).toAggregateExpression()) {
+) extends ColumnBased(column, Sum(UnresolvedAttribute.quoted(column)).toAggregateExpression()) {
 
   private val minOrMax: Either[String, Unit] = if (minValue.isEmpty && maxValue.isEmpty) {
     Left("'minValue' or 'maxValue' or both must be defined")
@@ -78,13 +78,14 @@
     }
 
     def getData(pctError: String): ListMap[String, String] = {
-      ((minValue, maxValue) match {
+      val initial: ListMap[String, String] = ((minValue, maxValue) match {
         case (Some(x), Some(y)) =>
           ListMap("lower_bound" -> x.asNumber.get.toString, "upper_bound" -> y.asNumber.get.toString)
         case (None, Some(y)) => ListMap("upper_bound" -> y.asNumber.get.toString)
         case (Some(x), None) => ListMap("lower_bound" -> x.asNumber.get.toString)
         case (None, None) => throw new RuntimeException("Must define at least one of minValue or maxValue.")
-      }) + ("inclusive" -> isInclusive.toString, "actual" -> r(idx).toString, "relative_error" -> pctError)
+      })
+      initial ++ List("inclusive" -> isInclusive.toString, "actual" -> r(idx).toString, "relative_error" -> pctError)
     }
 
     val actualSum: Double = dataType match {
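Note on `UnresolvedAttribute.quoted` (used here and in `ColumnBased.scala` above): the `apply(String)` constructor parses dots as nested-field separators, so a column literally named `a.b` would be resolved as a struct field, while `quoted` keeps the whole string as a single name part. A minimal sketch:

```scala
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute

// apply() splits on dots: "a.b" is read as field b of column a.
UnresolvedAttribute("a.b").nameParts        // Seq("a", "b")

// quoted() treats the string as one column name, as if written `a.b`.
UnresolvedAttribute.quoted("a.b").nameParts // Seq("a.b")
```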
diff --git a/src/main/scala/com/target/data_validator/validator/ValidatorBase.scala b/src/main/scala/com/target/data_validator/validator/ValidatorBase.scala
index 0134a3a..17d46af 100644
--- a/src/main/scala/com/target/data_validator/validator/ValidatorBase.scala
+++ b/src/main/scala/com/target/data_validator/validator/ValidatorBase.scala
@@ -141,8 +141,8 @@ object ValidatorBase extends LazyLogging {
   private val backtick = "`"
   val I0: Literal = Literal.create(0, IntegerType)
   val D0: Literal = Literal.create(0.0, DoubleType)
-  val L0: Literal = Literal.create(0, LongType)
-  val L1: Literal = Literal.create(1, LongType)
+  val L0: Literal = Literal.create(0L, LongType)
+  val L1: Literal = Literal.create(1L, LongType)
 
   def isValueColumn(v: String): Boolean = v.startsWith(backtick)
 
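Note on the `L0`/`L1` fix: a `Literal`'s runtime value should match its declared `DataType`. An `Int` value under `LongType` happened to work on Spark 2.x but is rejected, or fails later with a `ClassCastException`, on Spark 3.x, which validates literals more strictly, hence `0L`/`1L`. A minimal sketch:

```scala
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.LongType

// Value and DataType agree: a Long value for a LongType literal.
val zero = Literal.create(0L, LongType)

// Literal.create(0, LongType) would pair a boxed Int with LongType; that
// mismatch breaks on Spark 3.x, so the constants above now use Long literals.
```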
diff --git a/src/test/scala/com/target/data_validator/validator/RangeCheckSpec.scala b/src/test/scala/com/target/data_validator/validator/RangeCheckSpec.scala
index 9a77175..37e2198 100644
--- a/src/test/scala/com/target/data_validator/validator/RangeCheckSpec.scala
+++ b/src/test/scala/com/target/data_validator/validator/RangeCheckSpec.scala
@@ -305,7 +305,7 @@ class RangeCheckSpec extends AnyFunSpec with Matchers with TestingSparkSession {
         ValidatorQuickCheckError(
           ("item", "Eggs") :: Nil,
           5.99,
-          "RangeCheck failed! max = 5.99 and (('max <= 6.0) || ('max >= 10.0))"
+          "RangeCheck failed! max = 5.99 and (('max <= 6.0) OR ('max >= 10.0))"
         )
       )
     }
@@ -341,7 +341,7 @@ class RangeCheckSpec extends AnyFunSpec with Matchers with TestingSparkSession {
       val dict = new VarSubstitution
       val df = mkDataFrame(spark, defData)
       assert(!sut.configCheck(df))
-      assert(sut.colTest(df.schema, dict).sql == "((`avg` < `min`) OR (`avg` > `max`))")
+      assert(sut.colTest(df.schema, dict).sql == "((avg < min) OR (avg > max))")
     }
 
     it("bad minValue column") {
@@ -395,7 +395,7 @@ class RangeCheckSpec extends AnyFunSpec with Matchers with TestingSparkSession {
         ValidatorQuickCheckError(
           ("item", "Bread") :: Nil,
           0.99,
-          "RangeCheck failed! avg = 0.99 and (('avg <= 'min) || ('avg >= 'max))"
+          "RangeCheck failed! avg = 0.99 and (('avg <= 'min) OR ('avg >= 'max))"
         )
       )
     }
diff --git a/src/test/scala/com/target/data_validator/validator/StringLengthCheckSpec.scala b/src/test/scala/com/target/data_validator/validator/StringLengthCheckSpec.scala
index 3e0493f..0166030 100644
--- a/src/test/scala/com/target/data_validator/validator/StringLengthCheckSpec.scala
+++ b/src/test/scala/com/target/data_validator/validator/StringLengthCheckSpec.scala
@@ -327,13 +327,13 @@ class StringLengthCheckSpec extends AnyFunSpec with Matchers with TestingSparkSe
         ValidatorQuickCheckError(
           ("item", "I") :: Nil,
           "I",
-          "StringLengthCheck failed! item = I and ((length('item) < 5) || (length('item) > 6))"
+          "StringLengthCheck failed! item = I and ((length('item) < 5) OR (length('item) > 6))"
         )) ^
         (sut.getEvents contains
           ValidatorQuickCheckError(
             ("item", "") :: Nil,
             "",
-            "StringLengthCheck failed! item = and ((length('item) < 5) || (length('item) > 6))"
+            "StringLengthCheck failed! item = and ((length('item) < 5) OR (length('item) > 6))"
           ))
       )
     }
@@ -364,7 +364,7 @@ class StringLengthCheckSpec extends AnyFunSpec with Matchers with TestingSparkSe
         ValidatorQuickCheckError(
           ("item", "I") :: Nil,
           "I",
-          "StringLengthCheck failed! item = I and ((length('item) < 5) || (length('item) > 6))"
+          "StringLengthCheck failed! item = I and ((length('item) < 5) OR (length('item) > 6))"
         )
       )
 
@@ -373,7 +373,7 @@ class StringLengthCheckSpec extends AnyFunSpec with Matchers with TestingSparkSe
         ValidatorQuickCheckError(
           ("item", "") :: Nil,
           "",
-          "StringLengthCheck failed! item = and ((length('item) < 5) || (length('item) > 6))"
+          "StringLengthCheck failed! item = and ((length('item) < 5) OR (length('item) > 6))"
         )
       )
     }
@@ -404,7 +404,7 @@ class StringLengthCheckSpec extends AnyFunSpec with Matchers with TestingSparkSe
         ValidatorQuickCheckError(
           ("item", "I") :: Nil,
           "I",
-          "StringLengthCheck failed! item = I and ((length('item) < 5) || (length('item) > 5))"
+          "StringLengthCheck failed! item = I and ((length('item) < 5) OR (length('item) > 5))"
         )
       )
 
@@ -413,7 +413,7 @@ class StringLengthCheckSpec extends AnyFunSpec with Matchers with TestingSparkSe
         ValidatorQuickCheckError(
           ("item", "") :: Nil,
           "",
-          "StringLengthCheck failed! item = and ((length('item) < 5) || (length('item) > 5))"
+          "StringLengthCheck failed! item = and ((length('item) < 5) OR (length('item) > 5))"
         )
       )
 
@@ -422,7 +422,7 @@ class StringLengthCheckSpec extends AnyFunSpec with Matchers with TestingSparkSe
         ValidatorQuickCheckError(
           ("item", "Item23") :: Nil,
           "Item23",
-          "StringLengthCheck failed! item = Item23 and ((length('item) < 5) || (length('item) > 5))"
+          "StringLengthCheck failed! item = Item23 and ((length('item) < 5) OR (length('item) > 5))"
         )
       )
     }
diff --git a/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala b/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala
index 542049e..23b59c9 100644
--- a/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala
+++ b/src/test/scala/com/target/data_validator/validator/StringRegexCheckSpec.scala
@@ -183,7 +183,7 @@ class StringRegexCheckSpec extends AnyFunSpec with Matchers with TestingSparkSes
         ValidatorQuickCheckError(
           ("item", "I") :: Nil,
           "I",
-          "StringRegexCheck failed! item = I and (NOT 'item RLIKE ^It && isnotnull('item))"
+          "StringRegexCheck failed! item = I and (NOT RLIKE('item, ^It) AND isnotnull('item))"
         )
       )
     }
@@ -214,7 +214,7 @@ class StringRegexCheckSpec extends AnyFunSpec with Matchers with TestingSparkSes
         ValidatorQuickCheckError(
           ("item", "I") :: Nil,
           "I",
-          "StringRegexCheck failed! item = I and (NOT 'item RLIKE ^Item2 && isnotnull('item))"
+          "StringRegexCheck failed! item = I and (NOT RLIKE('item, ^Item2) AND isnotnull('item))"
         )
       )
 
@@ -223,7 +223,7 @@ class StringRegexCheckSpec extends AnyFunSpec with Matchers with TestingSparkSes
         ValidatorQuickCheckError(
           ("item", "Item1") :: Nil,
           "Item1",
-          "StringRegexCheck failed! item = Item1 and (NOT 'item RLIKE ^Item2 && isnotnull('item))"
+          "StringRegexCheck failed! item = Item1 and (NOT RLIKE('item, ^Item2) AND isnotnull('item))"
         )
       )
     }
@@ -254,13 +254,13 @@ class StringRegexCheckSpec extends AnyFunSpec with Matchers with TestingSparkSes
         ValidatorQuickCheckError(
           ("item", "I") :: Nil,
           "I",
-          "StringRegexCheck failed! item = I and (NOT 'item RLIKE ^Item2 && isnotnull('item))"
+          "StringRegexCheck failed! item = I and (NOT RLIKE('item, ^Item2) AND isnotnull('item))"
         )) ^
         (sut.getEvents contains
           ValidatorQuickCheckError(
             ("item", "Item1") :: Nil,
             "Item1",
-            "StringRegexCheck failed! item = Item1 and (NOT 'item RLIKE ^Item2 && isnotnull('item))"
+            "StringRegexCheck failed! item = Item1 and (NOT RLIKE('item, ^Item2) AND isnotnull('item))"
           ))
       )
     }
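Note on the spec updates above: the expected strings change only because newer Catalyst renders the same expressions differently. `And`/`Or` print as `AND`/`OR` rather than `&&`/`||`, `RLIKE` prints in function-call form, and `Expression.sql` no longer backtick-quotes plain attribute names.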