Merge pull request #8 from philwalk/bugfix-01

remove unused code; fix WSL bugs; update dependent libraries
philwalk · Nov 21, 2024 · 69e6c8c · 69e6c8c
2 parents 0426d69 + a64b289
commit 69e6c8c
Show file tree

Hide file tree

Showing 6 changed files with 229 additions and 42 deletions.
diff --git a/build.sbt b/build.sbt
@@ -1,5 +1,5 @@
-//lazy val scala213 = "2.13.13"
-lazy val scala3 = "3.3.3"
+//lazy val scala213 = "2.13.14"
+lazy val scala3   = "3.4.2"
 lazy val scalaVer = scala3
 
 lazy val supportedScalaVersions = List(scala3)
@@ -8,10 +8,11 @@ lazy val supportedScalaVersions = List(scala3)
 javacOptions ++= Seq("-source", "11", "-target", "11")
 
 //enablePlugins(ScalaNativePlugin)
+//nativeLinkStubs := true
 
 //ThisBuild / envFileName   := "dev.env" // sbt-dotenv plugin gets build environment here
 ThisBuild / scalaVersion  := scalaVer
-ThisBuild / version       := "0.10.11"
+ThisBuild / version       := "0.10.15" // jettison  // last org.simpleflatmapper sfm-csv version is 0.10.11
 ThisBuild / versionScheme := Some("semver-spec")
 
 ThisBuild / organization         := "org.vastblue"
@@ -53,15 +54,15 @@ ThisBuild / publishTo := {
 
 ThisBuild / publishMavenStyle.withRank(KeyRanks.Invisible) := true
 
-ThisBuild / crossScalaVersions := supportedScalaVersions
-
 // For all Sonatype accounts created on or after February 2021
 ThisBuild / sonatypeCredentialHost := "s01.oss.sonatype.org"
 
 resolvers += Resolver.mavenLocal
 
 publishTo := sonatypePublishToBundle.value
 
+ThisBuild / crossScalaVersions := supportedScalaVersions
+
 Compile / packageBin / packageOptions +=
   Package.ManifestAttributes(java.util.jar.Attributes.Name.CLASS_PATH -> "")
 
@@ -77,11 +78,12 @@ lazy val root = (project in file(".")).
   )
 
 libraryDependencies ++= Seq(
-  "org.scalatest"         %% "scalatest"       % "3.2.18" % Test,
-//"com.github.sbt"         % "junit-interface" % "0.13.3" % Test,
-  "org.simpleflatmapper"   % "sfm-csv"         % "9.0.0",
-  "io.github.chronoscala" %% "chronoscala"     % "2.0.10",
-  "org.vastblue"           % "unifile_3"       % "0.3.3",
+  "org.scalatest"            %% "scalatest"       % "3.2.19" % Test,
+//"com.github.sbt"            % "junit-interface" % "0.13.3" % Test,
+  "org.simpleflatmapper"      % "sfm-csv"         % "9.0.2",
+  "com.github.tototoshi"     %% "scala-csv"       % "2.0.0",
+  "io.github.chronoscala"    %% "chronoscala"     % "2.0.10",
+  "org.vastblue"              % "unifile_3"       % "0.3.5",
 )
 
 /*

diff --git a/project/build.properties b/project/build.properties
@@ -1,2 +1,2 @@
 #sbt.version=1.9.9
-sbt.version=1.10.0
+sbt.version=1.10.3
diff --git a/project/build.sbt b/project/build.sbt
@@ -1 +1 @@
-ThisBuild / scalaVersion := "2.12.19"
+ThisBuild / scalaVersion := "2.12.20"
diff --git a/project/plugins.sbt b/project/plugins.sbt
@@ -1,16 +1,16 @@
-scalaVersion := "2.12.19"
+scalaVersion := "2.12.20"
 
-val SONATYPE_VERSION = sys.env.getOrElse("SONATYPE_VERSION", "3.10.0") // "3.9.21")
+val SONATYPE_VERSION = sys.env.getOrElse("SONATYPE_VERSION", "3.12.2") // "3.10.0")
 
-//addSbtPlugin("org.scala-native" % "sbt-scala-native" % "0.4.17")
+//addSbtPlugin("org.scala-native" % "sbt-scala-native" % "0.5.3") // "0.4.17")
 
-addSbtPlugin("ch.epfl.scala"  % "sbt-bloop"     % "1.5.17")
-addSbtPlugin("ch.epfl.scala"  % "sbt-scalafix"  % "0.12.1")
+addSbtPlugin("ch.epfl.scala"  % "sbt-bloop"     % "2.0.5") // "2.0.3")
+addSbtPlugin("ch.epfl.scala"  % "sbt-scalafix"  % "0.13.0") // "0.12.1")
 addSbtPlugin("org.xerial.sbt" % "sbt-sonatype"  % SONATYPE_VERSION)
 addSbtPlugin("org.scalameta"  % "sbt-scalafmt"  % "2.5.2")
-addSbtPlugin("com.github.sbt" % "sbt-dynver"    % "5.0.1")
-addSbtPlugin("com.eed3si9n"   % "sbt-buildinfo" % "0.12.0")
-addSbtPlugin("com.github.sbt" % "sbt-pgp"       % "2.2.1")
+addSbtPlugin("com.github.sbt" % "sbt-dynver"    % "5.1.0") // "5.0.1")
+addSbtPlugin("com.eed3si9n"   % "sbt-buildinfo" % "0.13.1")
+addSbtPlugin("com.github.sbt" % "sbt-pgp"       % "2.3.0") // "2.2.1")
 
 addDependencyTreePlugin
 

diff --git a/src/main/scala/vastblue/file/FastCsv.scala b/src/main/scala/vastblue/file/FastCsv.scala
@@ -8,36 +8,18 @@ import org.simpleflatmapper.csv.*
 import java.io.{FileNotFoundException, Reader, StringReader, File as JFile}
 import java.nio.file.{Path, Files as JFiles, Paths as JPaths}
 import scala.jdk.CollectionConverters.*
+import scala.collection.immutable.ArraySeq
 
 /**
 * Csv Parser based on simpleflatmapper.
 * (replaces SimpleCsv)
 */
 object FastCsv {
 
-  // TODO: verify that this does not process more than the first line of the input String
-  def parseLine(str: String): List[String] = parseCsvLine(str) // alias
-  def parseCsvLine(str: String): List[String] = {
-    parseCsvStream(str).toList match {
-    case cols :: tail =>
-      cols.toList
-    case Nil =>
-      Nil
-    }
-  }
-  def parseCsvStream(str: String): Iterator[List[String]] = {
-    apply(str).iterator.map { _.toList }
-  }
-  def parseFile(infile: Path): FastCsv = {
-    FastCsv(infile, ",")
-  }
-//  def parseCsvFile(infile: Path): FastCsv = { // alias
-//    parseFile(infile)
-//  }
-
   def apply(jfile: JFile, _delimiter: String): FastCsv = {
     apply(jfile.toPath, _delimiter)
   }
+
   def apply(p: Path, delimiter: String = ""): FastCsv = {
     if (!p.isFile) {
       throw new java.nio.file.NoSuchFileException(s"${p.posx}")
@@ -49,10 +31,35 @@ object FastCsv {
     val reader        = new StringReader(str)
     new FastCsv(reader, p.toString, aDelimiter)
   }
+
   def apply(content: String): FastCsv = {
     new FastCsv(new StringReader(content), s"${content.take(10)}...", ",")
   }
 
+  // TODO: verify that this does not process more than the first line of the input String
+  def parseLine(str: String): ArraySeq[String] = parseCsvLine(str) // alias
+
+  def parseCsvLine(str: String): ArraySeq[String] = {
+    parseCsvStream(str) match {
+    case iter if iter.hasNext =>
+      iter.next()
+    case _ =>
+      ArraySeq.empty[String]
+    }
+  }
+
+  def parseCsvStream(str: String): Iterator[ArraySeq[String]] = {
+    val fastCsv = apply(str)
+    fastCsv.iterator.map(identity)
+  }
+
+  def parseFile(infile: Path): FastCsv = {
+    FastCsv(infile, ",")
+  }
+//  def parseCsvFile(infile: Path): FastCsv = { // alias
+//    parseFile(infile)
+//  }
+
   /* will not quit on error unless override ignoreErrors = false */
   def autoDetectDelimiter(sampleText: String, fname: String, ignoreErrors: Boolean = true): String = {
     var (tabs, commas, semis, pipes) = (0, 0, 0, 0)
@@ -103,12 +110,14 @@ case class FastCsv(val reader: Reader, identifier: String, delimiter: String) {
   case ";"  => ';'
   case _    => delimiter.charAt(0)
   }
-  def iterator: Iterator[Seq[String]] = CsvParser.separator(delim).iterator(reader).asScala.map { _.toSeq }
 
-  def rawrows: Seq[Seq[String]] = iterator.toSeq.filter { (cols: Seq[String]) => cols != Seq("") } // discard gratuitous empty rows
+  def rawrows: Seq[Seq[String]] = iterator.toSeq.filter { (cols: ArraySeq[String]) => cols != Seq("") } // discard gratuitous empty rows
   def rows                      = rawrows.map { row => row.map(_.trim) }
   def rowstrimmed               = rows
 
   // def stream = CsvParser.separator(delim).iterator(reader).asScala.iterator
   override def toString = identifier
+
+  import org.simpleflatmapper.csv.*
+  def iterator: Iterator[ArraySeq[String]] = CsvParser.separator(delim).iterator(reader).asScala.map { ArraySeq.unsafeWrapArray(_) }
 }
diff --git a/src/main/scala/vastblue/file/FastCsvTototoshi.scala b/src/main/scala/vastblue/file/FastCsvTototoshi.scala
@@ -0,0 +1,176 @@
+//#!/usr/bin/env -S scala -explain
+package vastblue.file
+
+import vastblue.pallet.*
+import vastblue.file.Util.*
+import com.github.tototoshi.csv.*
+
+import java.io.{FileNotFoundException, Reader, StringReader, File as JFile}
+import java.nio.file.{Path, Files as JFiles, Paths as JPaths}
+import scala.jdk.CollectionConverters.*
+
+/**
+* Csv Parser based on tototoshi scala-csv.
+* (replaces SimpleCsv)
+*/
+object FastCsvToto {
+
+  def apply(jfile: JFile, _delimiter: String): FastCsvToto = {
+    apply(jfile.toPath, _delimiter)
+  }
+
+  def apply(p: Path, delimiter: String = ""): FastCsvToto = {
+    if (!p.isFile) {
+      throw new java.nio.file.NoSuchFileException(s"${p.posx}")
+    }
+    val lines         = readLines(p)
+    def autoDelimiter = autoDetectDelimiter(lines.take(100).mkString("\n"), p.toString, ignoreErrors = false)
+    val aDelimiter    = if (delimiter.nonEmpty) delimiter else autoDelimiter
+    val str           = p.contentAsString
+    val reader        = new StringReader(str)
+    new FastCsvToto(reader, p.toString, aDelimiter)
+  }
+
+  def apply(content: String): FastCsvToto = {
+    new FastCsvToto(new StringReader(content), s"${content.take(10)}...", ",")
+  }
+
+  // TODO: verify that this does not process more than the first line of the input String
+  def parseLine(str: String): List[String] = parseCsvLine(str) // alias
+
+  def parseCsvLine(str: String): List[String] = {
+    parseCsvStream(str).toList match {
+    case cols :: tail =>
+      cols.toList
+    case Nil =>
+      Nil
+    }
+  }
+
+  def parseCsvStream(str: String): Iterator[List[String]] = {
+    apply(str).iterator.map { _.toList }
+  }
+
+  def parseFile(infile: Path): FastCsvToto = {
+    FastCsvToto(infile, ",")
+  }
+//  def parseCsvFile(infile: Path): FastCsvToto = { // alias
+//    parseFile(infile)
+//  }
+
+  /* returns "" rather than raising an error when no delimiter can be chosen, unless the caller passes ignoreErrors = false */
+  def autoDetectDelimiter(sampleText: String, fname: String, ignoreErrors: Boolean = true): String = {
+    var (tabs, commas, semis, pipes) = (0, 0, 0, 0)
+    sampleText.toCharArray.foreach {
+      case '\t' => tabs += 1
+      case ','  => commas += 1
+      case ';'  => semis += 1
+      case '|'  => pipes += 1
+      case _    =>
+    }
+    // Premise:
+    //   tab-delimited files contain more tabs than commas,
+    //   comma-delimited files contain more commas than tabs.
+    // Provides a reasonably fast guess, but can potentially fail.
+    //
+    // A much slower but more thorough approach would be:
+    //    1. replaceAll("""(?m)"[^"]*"""", "") // remove quoted strings
+    //    2. split("[\r\n]+") // extract multiple lines
+    //    3. count columns-per-row tallies using various delimiters
+    //    4. the tally with the most consistency is the "winner"
+    (commas, tabs, pipes, semis) match {
+      // in case of a tie between commas and tabs, commas win (TODO: configurable)
+    case (cms, tbs, pps, sms) if cms >= tbs && cms >= pps && cms >= sms  => ","
+    case (cms, tbs, pps, sms) if tbs >= cms && tbs >= pps && tbs >= sms => "\t"
+    case (cms, tbs, pps, sms) if pps > cms && pps > tbs && pps > sms    => "|"
+    case (cms, tbs, pps, sms) if sms > cms && sms > tbs && sms > pps    => ";"
+
+    case _ if ignoreErrors => ""
+
+    case _ =>
+      sys.error(
+        s"unable to choose delimiter: tabs[$tabs], commas[$commas], semis[$semis], pipes[$pipes] for file:\n[${fname}]"
+      )
+    }
+  }
+}
+
+case class FastCsvToto(val reader: Reader, identifier: String, delimiter: String) {
+  if (delimiter.length != 1) {
+    System.err.printf("warning: only sees the first character of the delimiter [%s]\n", delimiter)
+  }
+
+  def delim: Char = delimiter match {
+  case ""   => ' ' // treat rows with no delimiter as a single column
+  case ","  => ','
+  case "\t" => '\t'
+  case "|"  => '|'
+  case ";"  => ';'
+  case _    => delimiter.charAt(0)
+  }
+
+  def rawrows: Seq[Seq[String]] = iterator.toSeq.filter { (cols: Seq[String]) => cols != Seq("") } // discard gratuitous empty rows
+  def rows                      = rawrows.map { row => row.map(_.trim) }
+  def rowstrimmed               = rows
+
+  // def stream = CsvParser.separator(delim).iterator(reader).asScala.iterator
+  override def toString = identifier
+
+  import java.io.BufferedReader
+  import scala.util.Using
+  val br: BufferedReader = new BufferedReader(reader)
+
+  inline def iterateLines: Iterator[String] = Iterator.continually(readLine).takeWhile { _ != null }
+
+  class csvFormat extends CSVFormat {
+    val delimiter: Char = delim
+    val quoteChar: Char = '"'
+    val escapeChar: Char = '"'
+    val lineTerminator: String = "\n" // only used by tototoshi CSVWriter
+    val quoting: Quoting = QUOTE_MINIMAL
+    val treatEmptyLineAsNil: Boolean = false
+  }
+
+  lazy val csvParser = new CSVParser(new csvFormat)
+
+  inline def iterator: Iterator[Seq[String]] = {
+    for {
+      line <- iterateLines
+      // cols = CSVParser.parse(line, escapeChar, delimiterChar, quoteChar) match {
+      colsopt = csvParser.parseLine(line)
+      if colsopt != None
+    } yield colsopt.get
+  }
+
+  inline def readLine: String = {
+    val sb = new StringBuilder()
+    var c: Int = 0
+    def cc: Char = c.asInstanceOf[Char]
+    def nonEOL: Boolean  = c != -1 && c != '\n' && c != '\u2028' && c != '\u2029' && c != '\u0085'
+
+    while (nonEOL) {
+      c = br.read()
+      if (c != -1) {
+        sb.append(cc)
+        if (nonEOL) {
+          if (c == '\r') {
+            br.mark(1)
+            c = br.read()
+            if (c != -1) {
+              if (c == '\n') {
+                sb.append('\n')
+              } else {
+                br.reset()
+              }
+            }
+          }
+        }
+      }
+    }
+    if (sb.isEmpty) {
+      null
+    } else { 
+      sb.toString()
+    }
+  }
+}
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-ThisBuild / scalaVersion := "2.12.19"
		+ThisBuild / scalaVersion := "2.12.20"