diff --git a/pom.xml b/pom.xml
index 7c96e581..6d962135 100644
--- a/pom.xml
+++ b/pom.xml
@@ -107,6 +107,14 @@
+
+
+
+ cats.
+ cats.shaded.
+
+
+
@@ -551,6 +559,11 @@
hadoop-aws
${hadoop.version}
+
+ io.lemonlabs
+ scala-uri_${scala.binary.version}
+ 3.5.0
+
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
index c609c837..4b887141 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
@@ -15,31 +15,35 @@
*/
package io.archivesunleashed.matchbox
+import io.lemonlabs.uri.Url
+import io.lemonlabs.uri.config.UriConfig
+import io.lemonlabs.uri.decoding.PercentDecoder
import java.net.URL
/** Extracts the host domain name from a full url string. */
object ExtractDomain {
+ implicit val c: UriConfig = UriConfig(
+ decoder = PercentDecoder(ignoreInvalidPercentEncoding = true)
+ )
+
/** Extract source domains from a full url string.
*
* @param url a url as a string
- * @param source an optional default url for urls with no valid domain host
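-   * @return domain host, source or null if url is null.
+   * @return the apex domain of the url, or an empty string if the url is null or cannot be parsed.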
*/
- def apply(url: String, source: String = ""): String = {
- val maybeHost: Option[URL] = checkUrl(url)
- val maybeSource: Option[URL] = checkUrl(source)
- maybeHost match {
- case Some(host) =>
- host.getHost
-
- case None =>
- maybeSource match {
- case Some(source) =>
- source.getHost
- case None =>
+  def apply(url: String): String = {
+    checkUrl(url) match {
+      case Some(uri) =>
+        // Fall back to an empty string when no apex domain can be determined.
+        try {
+          Url.parse(uri.toString).apexDomain.getOrElse("")
+        } catch {
+          case scala.util.control.NonFatal(_) =>
             ""
         }
+      case None =>
+        ""
     }
   }
diff --git a/src/main/scala/io/archivesunleashed/udfs/package.scala b/src/main/scala/io/archivesunleashed/udfs/package.scala
index 30e6000e..bc712e36 100644
--- a/src/main/scala/io/archivesunleashed/udfs/package.scala
+++ b/src/main/scala/io/archivesunleashed/udfs/package.scala
@@ -53,7 +53,7 @@ package object udfs extends Serializable {
def extractDate: UserDefinedFunction =
udf(ExtractDate.apply(_: String, _: String))
def extractDomain: UserDefinedFunction =
- udf(ExtractDomain.apply(_: String, ""))
+ udf(ExtractDomain.apply(_: String))
def extractImageLinks: UserDefinedFunction =
udf(ExtractImageLinks.apply(_: String, _: String))
def extractLinks: UserDefinedFunction =
diff --git a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
index 29d461dd..bca3a8fa 100644
--- a/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
+++ b/src/test/scala/io/archivesunleashed/ArchiveRecordTest.scala
@@ -33,7 +33,7 @@ class ArchiveRecordTest extends FunSuite with BeforeAndAfter {
private val exampleArc = "example.arc.gz"
private val exampleWarc = "example.warc.gz"
private val exampleDate = "20080430"
- private val exampleUrl = "www.archive.org"
+ private val exampleUrl = "archive.org"
private val exampleStatusCode1 = "000"
private val exampleStatusCode2 = "200"
private val exampleMimeType = "text/plain"
diff --git a/src/test/scala/io/archivesunleashed/RecordDFTest.scala b/src/test/scala/io/archivesunleashed/RecordDFTest.scala
index bf8a1b87..17b0725c 100644
--- a/src/test/scala/io/archivesunleashed/RecordDFTest.scala
+++ b/src/test/scala/io/archivesunleashed/RecordDFTest.scala
@@ -130,7 +130,7 @@ class RecordDFTest extends FunSuite with BeforeAndAfter {
.loadArchives(arcPath, sc)
.all()
.select($"url")
- .filter(hasDomains(extractDomain($"url"), lit(Array("www.archive.org"))))
+ .filter(hasDomains(extractDomain($"url"), lit(Array("archive.org"))))
.take(1)(0)(0)
assert(base1.toString == expected)
diff --git a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala
index a324aa33..f4d6eb4e 100644
--- a/src/test/scala/io/archivesunleashed/RecordRDDTest.scala
+++ b/src/test/scala/io/archivesunleashed/RecordRDDTest.scala
@@ -111,7 +111,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
val base2 = RecordLoader
.loadArchives(arcPath, sc)
.keepValidPages()
- val urls: Set[String] = Set("www.archive.org", "www.sloan.org")
+ val urls: Set[String] = Set("archive.org", "sloan.org")
val x2 = base2.keepDomains(urls).count()
assert(x2 == expected)
}
@@ -259,7 +259,7 @@ class RecordRDDTest extends FunSuite with BeforeAndAfter {
val base = RecordLoader
.loadArchives(arcPath, sc)
.keepValidPages()
- val urls: Set[String] = Set("www.sloan.org")
+ val urls: Set[String] = Set("sloan.org")
val r2 = base.discardDomains(urls).count()
assert(r2 == expected)
}
diff --git a/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala b/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala
index e842ca9b..a5319637 100644
--- a/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala
+++ b/src/test/scala/io/archivesunleashed/app/DomainGraphExtractorTest.scala
@@ -48,12 +48,12 @@ class DomainGraphExtractorDfTest extends FunSuite with BeforeAndAfter {
assert(dfResult(0).get(0) == "20080430")
assert(dfResult(0).get(1) == "archive.org")
assert(dfResult(0).get(2) == "archive.org")
- assert(dfResult(0).get(3) == 37477)
+ assert(dfResult(0).get(3) == 37511)
assert(dfResult(1).get(0) == "20080430")
assert(dfResult(1).get(1) == "archive.org")
- assert(dfResult(1).get(2) == "wiki.etree.org")
- assert(dfResult(1).get(3) == 21)
+ assert(dfResult(1).get(2) == "etree.org")
+ assert(dfResult(1).get(3) == 31)
}
after {
diff --git a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala
index b39da1d9..5a6574c6 100644
--- a/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala
+++ b/src/test/scala/io/archivesunleashed/df/SimpleDfTest.scala
@@ -61,18 +61,18 @@ class SimpleDfTest extends FunSuite with BeforeAndAfter {
// +------------------+-----+
// | Domain|count|
// +------------------+-----+
- // | www.archive.org| 91|
+ // | archive.org| 91|
// | deadlists.com| 2|
- // |www.hideout.com.br| 1|
+ // | hideout.com.br| 1|
// +------------------+-----+
- assert(results(0).get(0) == "www.archive.org")
+ assert(results(0).get(0) == "archive.org")
assert(results(0).get(1) == 91)
assert(results(1).get(0) == "deadlists.com")
assert(results(1).get(1) == 2)
- assert(results(2).get(0) == "www.hideout.com.br")
+ assert(results(2).get(0) == "hideout.com.br")
assert(results(2).get(1) == 1)
}
diff --git a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala
index fb0af680..1d8e568b 100644
--- a/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala
+++ b/src/test/scala/io/archivesunleashed/matchbox/ExtractDomainTest.scala
@@ -23,7 +23,7 @@ import org.scalatest.junit.JUnitRunner
@RunWith(classOf[JUnitRunner])
class ExtractDomainTest extends FunSuite {
private val index = "index.html"
- private val umiacs = "www.umiacs.umd.edu"
+ private val umiacs = "umd.edu"
private val jimmylin = "http://www.umiacs.umd.edu/~jimmylin/"
private val lintool = "https://github.com/lintool"
private val github = "github.com"
@@ -52,7 +52,7 @@ class ExtractDomainTest extends FunSuite {
.+=(
(
"http://www.seetorontonow.canada-booknow.com\\booking_results.php",
- "www.seetorontonow.canada-booknow.com"
+ "canada-booknow.com"
)
)
.result()
@@ -63,16 +63,10 @@ class ExtractDomainTest extends FunSuite {
}
}
- test("Extract domains with base RDD") {
- data2.foreach {
- case (link, base, domain) => assert(ExtractDomain(link, base) == domain)
- }
- }
-
test("Test for domain errors RDD") {
// scalastyle:off null
assert(ExtractDomain(null) == "")
- assert(ExtractDomain(index, null) == "")
+ assert(ExtractDomain("") == "")
// scalastyle:on null
}