diff --git a/pom.xml b/pom.xml
index 45f34f32..e9e20485 100644
--- a/pom.xml
+++ b/pom.xml
@@ -108,14 +108,6 @@
-
-
-
- cats.
- cats.shaded.
-
-
-
@@ -584,11 +576,6 @@
hadoop-aws
${hadoop.version}
-
- io.lemonlabs
- scala-uri_${scala.binary.version}
- 3.5.0
-
diff --git a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
index 4b887141..29d14350 100644
--- a/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
+++ b/src/main/scala/io/archivesunleashed/matchbox/ExtractDomain.scala
@@ -15,17 +15,29 @@
*/
package io.archivesunleashed.matchbox
-import io.lemonlabs.uri.Url
-import io.lemonlabs.uri.config.UriConfig
-import io.lemonlabs.uri.decoding.PercentDecoder
import java.net.URL
/** Extracts the host domain name from a full url string. */
object ExtractDomain {
-
- implicit val c: UriConfig = UriConfig(
- decoder = PercentDecoder(ignoreInvalidPercentEncoding = true)
- )
+ /** Public suffix rules, downloaded from publicsuffix.org the first time this
+   * value is accessed.
+   *
+   * Each rule is one trimmed, non-empty line of the list; comment lines
+   * (starting with "//") are dropped. If the list cannot be opened or read,
+   * the result is an empty set, in which case `resolve` finds no matching
+   * rule and returns the host as given.
+   */
+ lazy val Suffixes: Set[String] = {
+   try {
+     // Opening the URL is itself the most likely point of failure, so it
+     // has to live inside the guarded region for the fallback to apply.
+     val source = scala.io.Source.fromURL(
+       "https://publicsuffix.org/list/public_suffix_list.dat",
+       "utf-8"
+     )
+     try {
+       source.getLines
+         .map(_.trim)
+         .filter(line => line.nonEmpty && !line.startsWith("//"))
+         .toSet
+     } finally {
+       source.close()
+     }
+   } catch {
+     // NonFatal lets OutOfMemoryError, InterruptedException, etc. propagate.
+     case scala.util.control.NonFatal(_) =>
+       Set.empty[String]
+   }
+ }
/** Extract source domains from a full url string.
*
@@ -33,20 +45,35 @@ object ExtractDomain {
* @return domain host, source or null if url is null.
*/
def apply(url: String): String = {
- val maybeUri: Option[URL] = checkUrl(url)
- maybeUri match {
- case Some(uri) =>
- try {
- Url.parse(uri.toString).apexDomain.mkString
- } catch {
- case e: Exception =>
- ""
- }
+   // Only strings that checkUrl accepts as a URL are resolved; anything
+   // else falls through to the empty-string case below.
+   checkUrl(url) match {
+     case Some(parsed) =>
+       // Reduce the URL's host component to its domain.
+       resolve(parsed.getHost.mkString)
case None =>
""
}
}
+ /** Resolves a host name using the default rule set held in `Suffixes`.
+   *
+   * @param host dot-separated host name
+   * @return whatever the two-argument overload returns for `host` and `Suffixes`
+   */
+ def resolve(host: String): String = {
+   resolve(host, Suffixes)
+ }
+
+ /** Reduces a host name to its registrable domain: a public suffix plus the
+   * one label directly in front of it.
+   *
+   * `suffixes` uses the public suffix list rule syntax:
+   *  - a plain rule ("co.uk") marks exactly that name as a public suffix;
+   *  - a wildcard rule ("*.kawasaki.jp") marks every name one label below
+   *    "kawasaki.jp" as a public suffix;
+   *  - an exception rule ("!city.kawasaki.jp") overrides a wildcard, making
+   *    "city.kawasaki.jp" itself a registrable domain.
+   *
+   * @param host dot-separated host name
+   * @param suffixes public suffix rules to match against
+   * @return the longest trailing part of `host` that is a registrable domain,
+   *         or the labels of `host` re-joined with dots when no rule applies
+   */
+ def resolve(host: String, suffixes: Set[String]): String = {
+   val labels = host.split('.')
+
+   // True when `suffixes` contains the rule formed by `marker` + joined labels.
+   def hasRule(marker: String, ls: Array[String]): Boolean =
+     suffixes.contains(marker + ls.mkString("."))
+
+   // A name is a public suffix when a plain or wildcard rule covers it and
+   // no exception rule names it explicitly.
+   def isPublicSuffix(ls: Array[String]): Boolean =
+     !hasRule("!", ls) &&
+       (hasRule("", ls) || (ls.length > 1 && hasRule("*.", ls.tail)))
+
+   // tails yields the host, then the host minus its first label, and so on,
+   // so the first hit is the longest candidate.
+   labels.tails
+     .filter(_.length > 1)
+     .find { candidate =>
+       hasRule("!", candidate) || isPublicSuffix(candidate.tail)
+     }
+     .getOrElse(labels)
+     .mkString(".")
+ }
+
def checkUrl(url: String): Option[URL] = {
try {
Some(new URL(url.replace("\\", "/")))