From 4e20e9c95445e752f0dea48081942294dbdba5e9 Mon Sep 17 00:00:00 2001 From: nruest Date: Sun, 3 Oct 2021 23:47:26 -0400 Subject: [PATCH 1/2] Update ExtractDomain to extract apex domains. - Resolves #519 - Add scala-uri as a dependency - Replace getHost method of extracting domains with apexDomain from scala-uri - Update tests as needed - Removed unused source parameter from ExtractDomain --- pom.xml | 5 +++++ .../matchbox/ExtractDomain.scala | 21 +++++++------------ .../io/archivesunleashed/udfs/package.scala | 2 +- .../archivesunleashed/ArchiveRecordTest.scala | 2 +- .../io/archivesunleashed/RecordDFTest.scala | 2 +- .../io/archivesunleashed/RecordRDDTest.scala | 4 ++-- .../app/DomainGraphExtractorTest.scala | 6 +++--- .../archivesunleashed/df/SimpleDfTest.scala | 8 +++---- .../matchbox/ExtractDomainTest.scala | 12 +++-------- 9 files changed, 27 insertions(+), 35 deletions(-) diff --git a/pom.xml b/pom.xml index 7c96e581..95f035c2 100644 --- a/pom.xml +++ b/pom.xml @@ -551,6 +551,11 @@ hadoop-aws ${hadoop.version} + + io.lemonlabs + scala-uri_${scala.binary.version} + 3.5.0 + diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala index c609c837..7b922012 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala @@ -15,6 +15,7 @@ */ package io.archivesunleashed.matchbox +import io.lemonlabs.uri.Url import java.net.URL /** Extracts the host domain name from a full url string. */ @@ -23,23 +24,15 @@ object ExtractDomain { /** Extract source domains from a full url string. * * @param url a url as a string - * @param source an optional default url for urls with no valid domain host * @return domain host, source or null if url is null. */ - def apply(url: String, source: String = ""): String = { - val maybeHost: Option[URL] = checkUrl(url) - val maybeSource: Option[URL] = checkUrl(source) - maybeHost match { - case Some(host) => - host.getHost - + def apply(url: String): String = { + val maybeUri: Option[URL] = checkUrl(url) + maybeUri match { + case Some(uri) => + Url.parse(uri.toString).apexDomain.mkString case None => - maybeSource match { - case Some(source) => - source.getHost - case None => - "" - } + "" } } diff --git a/src/main/scala/io/archivesunleashed/udfs/package.scala b/src/main/scala/io/archivesunleashed/udfs/package.scala index 30e6000e..bc712e36 100644 --- a/src/main/scala/io/archivesunleashed/udfs/package.scala +++ b/src/main/scala/io/archivesunleashed/udfs/package.scala @@ -53,7 +53,7 @@ package object udfs extends Serializable { def extractDate: UserDefinedFunction = udf(ExtractDate.apply(_: String, _: String)) def extractDomain: UserDefinedFunction = - udf(ExtractDomain.apply(_: String, "")) + udf(ExtractDomain.apply(_: String)) def extractImageLinks: UserDefinedFunction = udf(ExtractImageLinks.apply(_: String, _: String)) def extractLinks: UserDefinedFunction = diff --git a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala index 29d461dd..bca3a8fa 100644 --- a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala +++ b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala @@ -33,7 +33,7 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { private val exampleArc = "example.arc.gz" private val exampleWarc = "example.warc.gz" private val exampleDate = "20080430" - private val exampleUrl = "www.archive.org" + private val exampleUrl = "archive.org" private val exampleStatusCode1 = "000" private val exampleStatusCode2 = "200" private val exampleMimeType = "text/plain" diff --git a/src/test/scala/io/archivesunleashed/RecordDFTest.scala b/src/test/scala/io/archivesunleashed/RecordDFTest.scala index bf8a1b87..17b0725c 100644 --- a/src/test/scala/io/archivesunleashed/RecordDFTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordDFTest.scala @@ -130,7 +130,7 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { .loadArchives(arcPath, sc) .all() .select($"url") - .filter(hasDomains(extractDomain($"url"), lit(Array("www.archive.org")))) + .filter(hasDomains(extractDomain($"url"), lit(Array("archive.org")))) .take(1)(0)(0) assert(base1.toString == expected) diff --git a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala index a324aa33..f4d6eb4e 100644 --- a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala @@ -111,7 +111,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { val base2 = RecordLoader .loadArchives(arcPath, sc) .keepValidPages() - val urls: Set[String] = Set("www.archive.org", "www.sloan.org") + val urls: Set[String] = Set("archive.org", "sloan.org") val x2 = base2.keepDomains(urls).count() assert(x2 == expected) } @@ -259,7 +259,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { val base = RecordLoader .loadArchives(arcPath, sc) .keepValidPages() - val urls: Set[String] = Set("www.sloan.org") + val urls: Set[String] = Set("sloan.org") val r2 = base.discardDomains(urls).count() assert(r2 == expected) } diff --git a/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala index e842ca9b..a5319637 100644 --- a/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala @@ -48,12 +48,12 @@ class DomainGraphExtractorDfTest extends FunSuite with BeforeAndAfter { assert(dfResult(0).get(0) == "20080430") assert(dfResult(0).get(1) == "archive.org") assert(dfResult(0).get(2) == "archive.org") - assert(dfResult(0).get(3) == 37477) + assert(dfResult(0).get(3) == 37511) assert(dfResult(1).get(0) == "20080430") assert(dfResult(1).get(1) == "archive.org") - assert(dfResult(1).get(2) == "wiki.etree.org") - assert(dfResult(1).get(3) == 21) + assert(dfResult(1).get(2) == "etree.org") + assert(dfResult(1).get(3) == 31) } after { diff --git a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala index b39da1d9..5a6574c6 100644 --- a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala +++ b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala @@ -61,18 +61,18 @@ class SimpleDfTest extends FunSuite with BeforeAndAfter { // +------------------+-----+ // | Domain|count| // +------------------+-----+ - // | www.archive.org| 91| + // | archive.org| 91| // | deadlists.com| 2| - // |www.hideout.com.br| 1| + // | hideout.com.br| 1| // +------------------+-----+ - assert(results(0).get(0) == "www.archive.org") + assert(results(0).get(0) == "archive.org") assert(results(0).get(1) == 91) assert(results(1).get(0) == "deadlists.com") assert(results(1).get(1) == 2) - assert(results(2).get(0) == "www.hideout.com.br") + assert(results(2).get(0) == "hideout.com.br") assert(results(2).get(1) == 1) } diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala index fb0af680..1d8e568b 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala @@ -23,7 +23,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class ExtractDomainTest extends FunSuite { private val index = "index.html" - private val umiacs = "www.umiacs.umd.edu" + private val umiacs = "umd.edu" private val jimmylin = "http://www.umiacs.umd.edu/~jimmylin/" private val lintool = "https://github.com/lintool" private val github = "github.com" @@ -52,7 +52,7 @@ class ExtractDomainTest extends FunSuite { .+=( ( "http://www.seetorontonow.canada-booknow.com\\booking_results.php", - "www.seetorontonow.canada-booknow.com" + "canada-booknow.com" ) ) .result() @@ -63,16 +63,10 @@ class ExtractDomainTest extends FunSuite { } } - test("Extract domains with base RDD") { - data2.foreach { - case (link, base, domain) => assert(ExtractDomain(link, base) == domain) - } - } - test("Test for domain errors RDD") { // scalastyle:off null assert(ExtractDomain(null) == "") - assert(ExtractDomain(index, null) == "") + assert(ExtractDomain("") == "") // scalastyle:on null } From 9a343e930eae9f0974bafaa0c7d6afbf859c94e5 Mon Sep 17 00:00:00 2001 From: nruest Date: Thu, 21 Oct 2021 10:58:13 -0400 Subject: [PATCH 2/2] Shade cats, and catch Url.parse exceptions. - Caused by: io.lemonlabs.uri.parsing.UriParsingException: Invalid URL could not be parsed. Error(4,NonEmptyList(EndOfString(4,146))) --- pom.xml | 8 ++++++++ .../archivesunleashed/matchbox/ExtractDomain.scala | 13 ++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 95f035c2..6d962135 100644 --- a/pom.xml +++ b/pom.xml @@ -107,6 +107,14 @@ + + + + cats. + cats.shaded. + + + diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala index 7b922012..4b887141 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala @@ -16,11 +16,17 @@ package io.archivesunleashed.matchbox import io.lemonlabs.uri.Url +import io.lemonlabs.uri.config.UriConfig +import io.lemonlabs.uri.decoding.PercentDecoder import java.net.URL /** Extracts the host domain name from a full url string. */ object ExtractDomain { + implicit val c: UriConfig = UriConfig( + decoder = PercentDecoder(ignoreInvalidPercentEncoding = true) + ) + /** Extract source domains from a full url string. * * @param url a url as a string @@ -30,7 +36,12 @@ object ExtractDomain { val maybeUri: Option[URL] = checkUrl(url) maybeUri match { case Some(uri) => - Url.parse(uri.toString).apexDomain.mkString + try { + Url.parse(uri.toString).apexDomain.mkString + } catch { + case e: Exception => + "" + } case None => "" }