/** Entries of the Public Suffix List, fetched from publicsuffix.org.
  *
  * The list is downloaded the first time this value is accessed. Each line is
  * trimmed; blank lines and lines starting with `//` are dropped. The
  * remaining lines are kept exactly as written in the list.
  *
  * If the list cannot be opened or read, the result is an empty set, so
  * lookups against it simply find no suffix instead of throwing.
  */
lazy val Suffixes: Set[String] = {
  import scala.util.control.NonFatal

  try {
    // Opening the URL connects immediately and can throw, so it has to sit
    // inside the guarded region for the empty-set fallback to apply.
    val source = scala.io.Source.fromURL(
      "https://publicsuffix.org/list/public_suffix_list.dat",
      "utf-8"
    )
    try {
      // toSet forces the whole iterator before the source is closed.
      source
        .getLines()
        .map(_.trim)
        .filter(_.nonEmpty)
        .filter(!_.startsWith("//"))
        .toSet
    } finally {
      source.close()
    }
  } catch {
    case NonFatal(_) =>
      Set.empty[String]
  }
}
/** Extract the source domain from a full url string.
  *
  * @param url a url string
  * @return the domain resolved from the url's host, or an empty string when
  *         `checkUrl` yields no URL or the URL has no host.
  */
def apply(url: String): String =
  checkUrl(url).fold("") { parsed =>
    // Option(...) keeps a null host away from resolve, which would throw on it.
    Option(parsed.getHost).fold("")(host => resolve(host))
  }

/** Resolve the domain of `host` against [[Suffixes]].
  *
  * @param host a host name
  * @return the resolved domain, see `resolve(host, suffixes)`
  */
def resolve(host: String): String = resolve(host, Suffixes)

/** Resolve the domain of `host` against a set of public suffix entries.
  *
  * The host is split on `.` and its trailing label sequences are examined from
  * longest to shortest. The result is the first sequence whose remainder, after
  * dropping the leading label, is an entry of `suffixes` or is covered by a
  * wildcard entry (`*.` followed by the remainder minus its first label).
  *
  * Exception entries (`!` followed by a name) take precedence over every other
  * entry: when a trailing sequence of the host is listed as an exception, that
  * sequence is the result. This keeps a wildcard such as `*.ck` from swallowing
  * `www.ck` when `!www.ck` is present.
  *
  * @param host a host name
  * @param suffixes public suffix entries, plain, wildcard or exception
  * @return the matching labels joined with `.`; when nothing matches, all of
  *         the host's labels joined with `.`
  */
def resolve(host: String, suffixes: Set[String]): String = {
  val labels = host.split('.')
  // Only sequences of two or more labels can be "a label plus a suffix".
  val candidates = labels.tails.filter(_.length > 1).toList

  def isException(candidate: Array[String]): Boolean =
    suffixes.contains("!" + candidate.mkString("."))

  def sitsOnSuffix(candidate: Array[String]): Boolean = {
    val suffix = candidate.tail
    suffixes.contains(suffix.mkString(".")) ||
    (suffix.length > 1 && suffixes.contains("*." + suffix.tail.mkString(".")))
  }

  candidates
    .find(isException)
    .orElse(candidates.find(sitsOnSuffix))
    .getOrElse(labels)
    .mkString(".")
}