Skip to content

Commit

Permalink
refactor and map to Entry
Browse files Browse the repository at this point in the history
  • Loading branch information
AesaKamar committed Jul 24, 2018
1 parent fadcfba commit b269890
Show file tree
Hide file tree
Showing 4 changed files with 97 additions and 43 deletions.
4 changes: 3 additions & 1 deletion build.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ lazy val root = (project in file(".")).settings(
"org.typelevel" %% "cats-effect" % "1.0.0-RC2",
"org.typelevel" %% "cats-core" % "1.1.0",
"com.lihaoyi" %% "pprint" % "0.5.3",
"com.lihaoyi" %% "fastparse" % "1.0.0"
"com.lihaoyi" %% "fastparse" % "0.4.2",
"com.github.pathikrit" %% "better-files" % "3.5.0",
"com.github.tomtung" %% "latex2unicode" % "0.2.4"
)
)
File renamed without changes.
121 changes: 82 additions & 39 deletions src/main/scala/example/Scraper.scala
Original file line number Diff line number Diff line change
Expand Up @@ -13,52 +13,84 @@ object Scraper {
import net.ruippeixotog.scalascraper.model._

val browser = JsoupBrowser()
val doc = browser.parseFile("assets/00.html")

val content = doc >> element("div #content")
val entries =
(content >> elementList("div div"))
.filter(x => x.hasAttr("class") && x.attr("class").startsWith("indent"))
.map { x =>
val key = (x >> elements("strong")).map(_.text).headOption
val indentLevel =
(x >> elements("div")).map(_.attr("class")).map(_.lastOption.map(_.asDigit)).headOption.flatten
val description =
x.childNodes
.filter(_.isInstanceOf[TextNode])
.map { case TextNode(c) => c }
.toList
.fold("")(_.concat(_))
.trim
val links = (x >> elementList("a"))
.map(_.text.trim)

(indentLevel, key, description, links)
}
def parseFile(fileName: String) = {
val doc = browser.parseFile(fileName)

val parsedEntries = entries
.map {
case (indentLevel, key, description, links) =>
import Parser._
val identifier = key
.map(x => idParser.parse(x))
.flatMap {
case Success(v, _) => Some(v)
case Failure(_, _, _) => None
}
val parsedLinks = links
.map(x => idParser.parse(x))
.flatMap {
case Success(v, _) => Some(v)
case Failure(_, _, _) => None
val content = doc >> element("div #content")

val entries =
(content >> elementList("div div"))
.filter(x => x.hasAttr("class") && x.attr("class").startsWith("indent"))
.map { x =>
val key = (x >> elements("strong")).map(_.text).headOption
val indentLevel =
(x >> elements("div")).map(_.attr("class")).map(_.lastOption.map(_.asDigit)).headOption.flatten
val description = {
val latexedDescriptions = x.childNodes
.map {
case TextNode(" ") => None
case TextNode(c) => Some(NormalDescription(c))
case ElementNode(e) if (e >> elements("script")).nonEmpty => {
val r = (e >> elements("script")).map(_.innerHtml)
r.headOption.map(MathTexDescription)
}
case _ => None
}
.toList
.flatten
resolveDescription(latexedDescriptions)
}

(indentLevel, identifier, description, parsedLinks)
val links = (x >> elementList("a"))
.map(_.text.trim)

(indentLevel, key, description, links)
}

val parsedEntries = entries
.map {
case (indentLevel, key, description, links) =>
import Parser._
val identifier = key
.map(x => idParser.parse(x))
.flatMap {
case Success(v, _) => Some(v)
case Failure(_, _, _) => None
}
val parsedLinks = links
.map(x => idParser.parse(x))
.flatMap {
case Success(v, _) => Some(v)
case Failure(_, _, _) => None
}

(indentLevel, identifier, description, parsedLinks)
}
.filter(_._2.nonEmpty)

parsedEntries
}

/** Collapses description fragments into a single plain-text description.
  *
  * Each `MathTexDescription` is converted with `LaTeX2Unicode.convert`, the
  * fragments are concatenated in their original order, every bracketed
  * `[...]` span is removed, and surrounding whitespace is trimmed.
  *
  * @param descriptions the fragments to merge, in document order
  * @return a single `NormalDescription` holding the cleaned-up text
  */
def resolveDescription(descriptions: List[Description]): NormalDescription = {
  import com.github.tomtung.latex2unicode._

  // Render every fragment as plain text, then join them without a separator.
  val accumulated = descriptions
    .map {
      case NormalDescription(text)   => text
      case MathTexDescription(latex) => LaTeX2Unicode.convert(latex)
    }
    .mkString
    .trim

  // The quantifier is reluctant on purpose: a greedy `.*` would match from the
  // first '[' to the LAST ']' and swallow the ordinary text sitting between two
  // separate bracketed annotations.
  val withoutSeeAlsoAnnotations = accumulated.replaceAll("\\[.*?\\]", "")

  NormalDescription(withoutSeeAlsoAnnotations.trim)
}

}

/** A fragment of an entry's description, as collected from the child nodes of an entry element. */
sealed trait Description
/** A fragment that is already plain text; `Scraper.parseFile` builds it from text nodes. */
final case class NormalDescription(value: String) extends Description
/** A fragment taken from the inner HTML of a `script` element; `Scraper.resolveDescription`
  * passes `value` through `LaTeX2Unicode.convert`. Presumably LaTeX math source -- TODO confirm.
  */
final case class MathTexDescription(value: String) extends Description

sealed trait Identifier
final case class Area(id: String) extends Identifier
final case class SubArea(
Expand All @@ -85,3 +117,14 @@ object Parser {
val idParser: Parser[Identifier] = P(specParser | subAreaParser | areaParser)

}

/** A fully resolved entry.
  *
  * @param level       indent level of the entry
  * @param identifier  parsed identifier of the entry itself
  * @param description plain-text description
  * @param links       identifiers parsed from the entry's links
  */
final case class Entry(
    level: Int,
    identifier: Identifier,
    description: String,
    links: List[Identifier])

object Entry {

  /** Total conversion from the raw tuple shape to an [[Entry]].
    *
    * @param t (indent level, identifier, description, links)
    * @return `Some(entry)` when both the indent level and the identifier are
    *         present, `None` otherwise
    */
  def fromTupleOption(
      t: (Option[Int], Option[Identifier], NormalDescription, List[Identifier])): Option[Entry] = {
    val (levelOpt, idOpt, description, links) = t
    for {
      level <- levelOpt
      id    <- idOpt
    } yield Entry(level, id, description.value, links)
  }

  /** Converts the raw tuple shape to an [[Entry]].
    *
    * Previously a non-exhaustive pattern match, which failed with an opaque
    * `MatchError` on incomplete tuples; the failure is now explicit and says
    * what was missing. Use [[fromTupleOption]] to handle that case without
    * exceptions.
    *
    * @param t (indent level, identifier, description, links)
    * @return the corresponding [[Entry]]
    * @throws IllegalArgumentException if the indent level or the identifier is `None`
    */
  def fromTuple(t: (Option[Int], Option[Identifier], NormalDescription, List[Identifier])): Entry =
    fromTupleOption(t).getOrElse(
      throw new IllegalArgumentException(
        s"Cannot build an Entry: indent level or identifier is missing in $t"))
}
15 changes: 12 additions & 3 deletions src/test/scala/example/ParserTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,22 @@ package example
import org.scalatest._
import pprint._
import org.scalatest.EitherValues._
import org.scalatest.OptionValues._

// Smoke tests for Scraper: both cases only print their results and end with
// `succeed`, so they check that the code runs without throwing rather than
// asserting on specific values.
class ScraperTest extends AsyncFreeSpec with Matchers {
import Scraper._
// Parses one asset file, converts every resulting tuple to an Entry and pretty-prints it.
"Grabbing an html document" in {
// NOTE(review): the `doc` line below and the `parsedEntries.map` line after it
// look like the removed side of the diff (both names are locals of
// Scraper.parseFile in the new code) -- confirm against the real file.
val response = doc

parsedEntries.map(x => pprintln(x))
parseFile("assets/57.html")
.map(Entry.fromTuple)
.map(x => pprintln(x))
succeed
}
// Registered with `ignore`; lists the contents of `assets/` and prints the sorted paths.
"listing files" ignore {
import better.files._
import better.files.Dsl._
import File._
val files = ls(file"assets/")
pprintln(files.toList.map(_.path.toString).sorted)
succeed
}
}

0 comments on commit b269890

Please sign in to comment.