diff --git a/pom.xml b/pom.xml index 7c96e581..6d962135 100644 --- a/pom.xml +++ b/pom.xml @@ -107,6 +107,14 @@ + + + + cats. + cats.shaded. + + + @@ -551,6 +559,11 @@ hadoop-aws ${hadoop.version} + + io.lemonlabs + scala-uri_${scala.binary.version} + 3.5.0 + diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala index c609c837..4b887141 100644 --- a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala +++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala @@ -15,31 +15,35 @@ */ package io.archivesunleashed.matchbox +import io.lemonlabs.uri.Url +import io.lemonlabs.uri.config.UriConfig +import io.lemonlabs.uri.decoding.PercentDecoder import java.net.URL /** Extracts the host domain name from a full url string. */ object ExtractDomain { + implicit val c: UriConfig = UriConfig( + decoder = PercentDecoder(ignoreInvalidPercentEncoding = true) + ) + /** Extract source domains from a full url string. * * @param url a url as a string - * @param source an optional default url for urls with no valid domain host - * @return domain host, source or null if url is null. + * @return the apex domain of the url (e.g. `umd.edu` for `http://www.umiacs.umd.edu/`), or an empty string if the url is null, empty or cannot be parsed. 
*/ - def apply(url: String, source: String = ""): String = { - val maybeHost: Option[URL] = checkUrl(url) - val maybeSource: Option[URL] = checkUrl(source) - maybeHost match { - case Some(host) => - host.getHost - - case None => - maybeSource match { - case Some(source) => - source.getHost - case None => + def apply(url: String): String = { + val maybeUri: Option[URL] = checkUrl(url) + maybeUri match { + case Some(uri) => + try { + Url.parse(uri.toString).apexDomain.mkString + } catch { + case e: Exception => "" } + case None => + "" } } diff --git a/src/main/scala/io/archivesunleashed/udfs/package.scala b/src/main/scala/io/archivesunleashed/udfs/package.scala index 30e6000e..bc712e36 100644 --- a/src/main/scala/io/archivesunleashed/udfs/package.scala +++ b/src/main/scala/io/archivesunleashed/udfs/package.scala @@ -53,7 +53,7 @@ package object udfs extends Serializable { def extractDate: UserDefinedFunction = udf(ExtractDate.apply(_: String, _: String)) def extractDomain: UserDefinedFunction = - udf(ExtractDomain.apply(_: String, "")) + udf(ExtractDomain.apply(_: String)) def extractImageLinks: UserDefinedFunction = udf(ExtractImageLinks.apply(_: String, _: String)) def extractLinks: UserDefinedFunction = diff --git a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala index 29d461dd..bca3a8fa 100644 --- a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala +++ b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala @@ -33,7 +33,7 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter { private val exampleArc = "example.arc.gz" private val exampleWarc = "example.warc.gz" private val exampleDate = "20080430" - private val exampleUrl = "www.archive.org" + private val exampleUrl = "archive.org" private val exampleStatusCode1 = "000" private val exampleStatusCode2 = "200" private val exampleMimeType = "text/plain" diff --git 
a/src/test/scala/io/archivesunleashed/RecordDFTest.scala b/src/test/scala/io/archivesunleashed/RecordDFTest.scala index bf8a1b87..17b0725c 100644 --- a/src/test/scala/io/archivesunleashed/RecordDFTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordDFTest.scala @@ -130,7 +130,7 @@ class RecordDFTest extends FunSuite with BeforeAndAfter { .loadArchives(arcPath, sc) .all() .select($"url") - .filter(hasDomains(extractDomain($"url"), lit(Array("www.archive.org")))) + .filter(hasDomains(extractDomain($"url"), lit(Array("archive.org")))) .take(1)(0)(0) assert(base1.toString == expected) diff --git a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala index a324aa33..f4d6eb4e 100644 --- a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala +++ b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala @@ -111,7 +111,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { val base2 = RecordLoader .loadArchives(arcPath, sc) .keepValidPages() - val urls: Set[String] = Set("www.archive.org", "www.sloan.org") + val urls: Set[String] = Set("archive.org", "sloan.org") val x2 = base2.keepDomains(urls).count() assert(x2 == expected) } @@ -259,7 +259,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter { val base = RecordLoader .loadArchives(arcPath, sc) .keepValidPages() - val urls: Set[String] = Set("www.sloan.org") + val urls: Set[String] = Set("sloan.org") val r2 = base.discardDomains(urls).count() assert(r2 == expected) } diff --git a/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala index e842ca9b..a5319637 100644 --- a/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala +++ b/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala @@ -48,12 +48,12 @@ class DomainGraphExtractorDfTest extends FunSuite with BeforeAndAfter { assert(dfResult(0).get(0) == "20080430") 
assert(dfResult(0).get(1) == "archive.org") assert(dfResult(0).get(2) == "archive.org") - assert(dfResult(0).get(3) == 37477) + assert(dfResult(0).get(3) == 37511) assert(dfResult(1).get(0) == "20080430") assert(dfResult(1).get(1) == "archive.org") - assert(dfResult(1).get(2) == "wiki.etree.org") - assert(dfResult(1).get(3) == 21) + assert(dfResult(1).get(2) == "etree.org") + assert(dfResult(1).get(3) == 31) } after { diff --git a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala index b39da1d9..5a6574c6 100644 --- a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala +++ b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala @@ -61,18 +61,18 @@ class SimpleDfTest extends FunSuite with BeforeAndAfter { // +------------------+-----+ // | Domain|count| // +------------------+-----+ - // | www.archive.org| 91| + // | archive.org| 91| // | deadlists.com| 2| - // |www.hideout.com.br| 1| + // | hideout.com.br| 1| // +------------------+-----+ - assert(results(0).get(0) == "www.archive.org") + assert(results(0).get(0) == "archive.org") assert(results(0).get(1) == 91) assert(results(1).get(0) == "deadlists.com") assert(results(1).get(1) == 2) - assert(results(2).get(0) == "www.hideout.com.br") + assert(results(2).get(0) == "hideout.com.br") assert(results(2).get(1) == 1) } diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala index fb0af680..1d8e568b 100644 --- a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala +++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala @@ -23,7 +23,7 @@ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class ExtractDomainTest extends FunSuite { private val index = "index.html" - private val umiacs = "www.umiacs.umd.edu" + private val umiacs = "umd.edu" private val jimmylin = 
"http://www.umiacs.umd.edu/~jimmylin/" private val lintool = "https://github.com/lintool" private val github = "github.com" @@ -52,7 +52,7 @@ class ExtractDomainTest extends FunSuite { .+=( ( "http://www.seetorontonow.canada-booknow.com\\booking_results.php", - "www.seetorontonow.canada-booknow.com" + "canada-booknow.com" ) ) .result() @@ -63,16 +63,10 @@ class ExtractDomainTest extends FunSuite { } } - test("Extract domains with base RDD") { - data2.foreach { - case (link, base, domain) => assert(ExtractDomain(link, base) == domain) - } - } - test("Test for domain errors RDD") { // scalastyle:off null assert(ExtractDomain(null) == "") - assert(ExtractDomain(index, null) == "") + assert(ExtractDomain("") == "") // scalastyle:on null }