From 1b9009993da90aec26113c5faa58c5813f56fa62 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= <snejus@protonmail.com>
Date: Tue, 8 Oct 2024 13:08:26 +0100
Subject: [PATCH] Apply dist_thresh to all search backends

This commit introduces a distance threshold mechanism to the Genius
backend and unifies its implementation across the rest of backends that
perform searching and matching artists and titles.

- Create a new `SearchBackend` base class with a method `check_match`
  that performs checking.
- Start using undocumented `dist_thresh` configuration option for good,
  and mention it in the docs. This controls the maximum allowable
  distance for matching artist and title names.

These changes aim to improve the accuracy of lyrics matching, especially
when there are slight variations in artist or title names, see #4791.
---
 beetsplug/lyrics.py         | 131 +++++++++++++++++++-----------------
 docs/changelog.rst          |   3 +
 docs/plugins/lyrics.rst     |   6 ++
 test/plugins/test_lyrics.py |  59 +++++++++++++---
 4 files changed, 126 insertions(+), 73 deletions(-)

diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index ccc8cae75f..22f7ec4275 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -14,7 +14,6 @@
 
 """Fetches, embeds, and displays lyrics."""
 
-import difflib
 import errno
 import itertools
 import json
@@ -24,6 +23,7 @@
 import unicodedata
 import urllib
 import warnings
+from functools import cached_property, partial
 
 import requests
 from unidecode import unidecode
@@ -356,15 +356,43 @@ def fetch(self, artist, title, album=None, length=None):
         return lyrics
 
 
-class Genius(Backend):
+class SearchBackend(Backend):
+    REQUIRES_BS = True
+
+    @cached_property
+    def dist_thresh(self) -> float:
+        return self.config["dist_thresh"].get(float)
+
+    def check_match(
+        self, target_artist: str, target_title: str, artist: str, title: str
+    ) -> bool:
+        """Check if the given artist and title are 'good enough' match."""
+        max_dist = max(
+            string_dist(target_artist, artist),
+            string_dist(target_title, title),
+        )
+        if round(max_dist, 2) <= self.dist_thresh:
+            return True
+
+        self._log.warning(
+            "({}, {}) pair does not match ({}, {}), dist {:.2f} > expected {:.2f}",
+            artist,
+            title,
+            target_artist,
+            target_title,
+            max_dist,
+            self.dist_thresh,
+        )
+        return False
+
+
+class Genius(SearchBackend):
     """Fetch lyrics from Genius via genius-api.
 
     Simply adapted from
     bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
     """
 
-    REQUIRES_BS = True
-
     base_url = "https://api.genius.com"
 
     def __init__(self, config, log):
@@ -387,19 +415,15 @@ def fetch(self, artist, title, album=None, length=None):
             self._log.debug("Genius API request returned invalid JSON")
             return None
 
-        # find a matching artist in the json
+        check = partial(self.check_match, artist, title)
         for hit in json["response"]["hits"]:
-            hit_artist = hit["result"]["primary_artist"]["name"]
-
-            if slug(hit_artist) == slug(artist):
-                html = self.fetch_url(hit["result"]["url"])
+            result = hit["result"]
+            if check(result["primary_artist"]["name"], result["title"]):
+                html = self.fetch_url(result["url"])
                 if not html:
                     return None
                 return self._scrape_lyrics_from_html(html)
 
-        self._log.debug(
-            "Genius failed to find a matching artist for '{0}'", artist
-        )
         return None
 
     def _search(self, artist, title):
@@ -494,10 +518,8 @@ def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
         return lyrics_div.get_text()
 
 
-class Tekstowo(Backend):
+class Tekstowo(SearchBackend):
     # Fetch lyrics from Tekstowo.pl.
-    REQUIRES_BS = True
-
     BASE_URL = "http://www.tekstowo.pl"
     URL_PATTERN = BASE_URL + "/wyszukaj.html?search-title=%s&search-artist=%s"
 
@@ -563,14 +585,9 @@ def extract_lyrics(self, html, artist, title):
         if not info_elements:
             return None
 
-        html_title = info_elements[-1].get_text()
         html_artist = info_elements[-2].get_text()
-
-        title_dist = string_dist(html_title, title)
-        artist_dist = string_dist(html_artist, artist)
-
-        thresh = self.config["dist_thresh"].get(float)
-        if title_dist > thresh or artist_dist > thresh:
+        html_title = info_elements[-1].get_text()
+        if not self.check_match(artist, title, html_artist, html_title):
             return None
 
         lyrics_div = soup.select("div.song-text > div.inner-text")
@@ -649,16 +666,9 @@ def is_text_notcode(text):
         return None
 
 
-class Google(Backend):
+class Google(SearchBackend):
     """Fetch lyrics from Google search results."""
 
-    REQUIRES_BS = True
-
-    def __init__(self, config, log):
-        super().__init__(config, log)
-        self.api_key = config["google_API_key"].as_str()
-        self.engine_id = config["google_engine_ID"].as_str()
-
     def is_lyrics(self, text, artist=None):
         """Determine whether the text seems to be valid lyrics."""
         if not text:
@@ -704,20 +714,21 @@ def slugify(self, text):
     BY_TRANS = ["by", "par", "de", "von"]
     LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
 
-    def is_page_candidate(self, url_link, url_title, title, artist):
+    def is_page_candidate(
+        self, artist: str, title: str, url_link: str, url_title: str
+    ) -> bool:
         """Return True if the URL title makes it a good candidate to be a
         page that contains lyrics of title by artist.
         """
-        title = self.slugify(title.lower())
+        title_slug = self.slugify(title.lower())
+        url_title_slug = self.slugify(url_title.lower())
+        if title_slug in url_title_slug:
+            return True
+
         artist = self.slugify(artist.lower())
         sitename = re.search(
             "//([^/]+)/.*", self.slugify(url_link.lower())
         ).group(1)
-        url_title = self.slugify(url_title.lower())
-
-        # Check if URL title contains song title (exact match)
-        if url_title.find(title) != -1:
-            return True
 
         # or try extracting song title from URL title and check if
         # they are close enough
@@ -727,18 +738,15 @@ def is_page_candidate(self, url_link, url_title, title, artist):
             + self.LYRICS_TRANS
         )
         tokens = [re.escape(t) for t in tokens]
-        song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
+        song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
 
-        song_title = song_title.strip("_|")
-        typo_ratio = 0.9
-        ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
-        return ratio >= typo_ratio
+        return self.check_match(artist, title_slug, artist, song_title)
 
     def fetch(self, artist, title, album=None, length=None):
         query = f"{artist} {title}"
         url = "https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s" % (
-            self.api_key,
-            self.engine_id,
+            self.config["google_API_key"].as_str(),
+            self.config["google_engine_ID"].as_str(),
             urllib.parse.quote(query.encode("utf-8")),
         )
 
@@ -755,24 +763,21 @@ def fetch(self, artist, title, album=None, length=None):
             self._log.debug("google backend error: {0}", reason)
             return None
 
-        if "items" in data.keys():
-            for item in data["items"]:
-                url_link = item["link"]
-                url_title = item.get("title", "")
-                if not self.is_page_candidate(
-                    url_link, url_title, title, artist
-                ):
-                    continue
-                html = self.fetch_url(url_link)
-                if not html:
-                    continue
-                lyrics = scrape_lyrics_from_html(html)
-                if not lyrics:
-                    continue
-
-                if self.is_lyrics(lyrics, artist):
-                    self._log.debug("got lyrics from {0}", item["displayLink"])
-                    return lyrics
+        check_candidate = partial(self.is_page_candidate, artist, title)
+        for item in data["items"]:
+            url_link = item["link"]
+            if not check_candidate(url_link, item.get("title", "")):
+                continue
+            html = self.fetch_url(url_link)
+            if not html:
+                continue
+            lyrics = scrape_lyrics_from_html(html)
+            if not lyrics:
+                continue
+
+            if self.is_lyrics(lyrics, artist):
+                self._log.debug("got lyrics from {0}", item["displayLink"])
+                return lyrics
 
         return None
 
@@ -796,6 +801,7 @@ def __init__(self):
                 "bing_client_secret": None,
                 "bing_lang_from": [],
                 "bing_lang_to": None,
+                "dist_thresh": 0.11,
                 "google_API_key": None,
                 "google_engine_ID": "009217259823014548361:lndtuqkycfu",
                 "genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
@@ -807,7 +813,6 @@ def __init__(self):
                 # Musixmatch is disabled by default as they are currently blocking
                 # requests with the beets user agent.
                 "sources": [s for s in self.SOURCES if s != "musixmatch"],
-                "dist_thresh": 0.1,
             }
         )
         self.config["bing_client_secret"].redact = True
diff --git a/docs/changelog.rst b/docs/changelog.rst
index a9ed03e422..97ca95c122 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -44,6 +44,9 @@ Bug fixes:
 * :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description.
 * Remove single quotes from all SQL queries
   :bug:`4709`
+* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
+  to match lyrics when there was a slight variation in the artist name.
+  :bug:`4791`
 
 For packagers:
 
diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
index 4939476938..f94d3a209e 100644
--- a/docs/plugins/lyrics.rst
+++ b/docs/plugins/lyrics.rst
@@ -42,6 +42,12 @@ configuration file. The available options are:
   Default: ``[]``
 - **bing_lang_to**: Language to translate lyrics into.
   Default: None.
+- **dist_thresh**: The maximum distance between the artist and title
+  combination of the music file and lyrics candidate to consider them a match.
+  Lower values will make the plugin more strict, higher values will make it
+  more lenient. This does not apply to the ``lrclib`` backend as it matches
+  durations.
+  Default: ``0.11``.
 - **fallback**: By default, the file will be left unchanged when no lyrics are
   found. Use the empty string ``''`` to reset the lyrics in such a case.
   Default: None.
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 937e0a3cb1..79b4f6e268 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -18,7 +18,7 @@
 import os
 import re
 import unittest
-from unittest.mock import MagicMock, patch
+from unittest.mock import patch
 
 import confuse
 import pytest
@@ -31,11 +31,12 @@
 from beetsplug import lyrics
 
 log = logging.getLogger("beets.test_lyrics")
-raw_backend = lyrics.Backend({}, log)
-google = lyrics.Google(MagicMock(), log)
-genius = lyrics.Genius(MagicMock(), log)
-tekstowo = lyrics.Tekstowo(MagicMock(), log)
-lrclib = lyrics.LRCLib(MagicMock(), log)
+plugin = lyrics.LyricsPlugin()
+raw_backend = lyrics.Backend(plugin.config, log)
+google = lyrics.Google(plugin.config, log)
+genius = lyrics.Genius(plugin.config, log)
+tekstowo = lyrics.Tekstowo(plugin.config, log)
+lrclib = lyrics.LRCLib(plugin.config, log)
 
 
 class LyricsPluginTest(unittest.TestCase):
@@ -244,6 +245,42 @@ def setUp(self):
             self.skipTest("Beautiful Soup 4 not available")
 
 
+class TestSearchBackend:
+    @pytest.fixture
+    def backend(self, dist_thresh):
+        plugin = lyrics.LyricsPlugin()
+        plugin.config.set({"dist_thresh": dist_thresh})
+        return lyrics.SearchBackend(plugin.config, plugin._log)
+
+    @pytest.mark.parametrize(
+        "dist_thresh, target_artist, artist, should_match",
+        [
+            (0.11, "Target Artist", "Target Artist", True),
+            (0.11, "Target Artist", "Target Artis", True),
+            (0.11, "Target Artist", "Target Arti", False),
+            (0.11, "Psychonaut", "Psychonaut (BEL)", True),
+            (0.11, "beets song", "beats song", True),
+            (0.10, "beets song", "beats song", False),
+            (
+                0.11,
+                "Lucid Dreams (Forget Me)",
+                "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+                False,
+            ),
+            (
+                0.12,
+                "Lucid Dreams (Forget Me)",
+                "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+                True,
+            ),
+        ],
+    )
+    def test_check_match(self, backend, target_artist, artist, should_match):
+        assert (
+            backend.check_match(target_artist, "", artist, "") == should_match
+        )
+
+
 class LyricsPluginSourcesTest(LyricsGoogleBaseTest, LyricsAssertions):
     """Check that beets google custom search engine sources are correctly
     scraped.
@@ -400,7 +437,7 @@ def test_is_page_candidate_exact_match(self):
             html, "html.parser", parse_only=SoupStrainer("title")
         )
         assert google.is_page_candidate(
-            url, soup.title.string, s["title"], s["artist"]
+            s["artist"], s["title"], url, soup.title.string
         ), url
 
     def test_is_page_candidate_fuzzy_match(self):
@@ -413,12 +450,12 @@ def test_is_page_candidate_fuzzy_match(self):
 
         # very small diffs (typo) are ok eg 'beats' vs 'beets' with same artist
         assert google.is_page_candidate(
-            url, url_title, s["title"], s["artist"]
+            s["artist"], s["title"], url, url_title
         ), url
         # reject different title
         url_title = "example.com | seets bong lyrics by John doe"
         assert not google.is_page_candidate(
-            url, url_title, s["title"], s["artist"]
+            s["artist"], s["title"], url, url_title
         ), url
 
     def test_is_page_candidate_special_chars(self):
@@ -430,7 +467,7 @@ def test_is_page_candidate_special_chars(self):
         url = s["url"] + s["path"]
         url_title = "foo"
 
-        google.is_page_candidate(url, url_title, s["title"], "Sunn O)))")
+        google.is_page_candidate("Sunn O)))", s["title"], url, url_title)
 
 
 # test Genius backend
@@ -506,12 +543,14 @@ def test_json(self, mock_fetch_url, mock_scrape):
                                     "name": "\u200bblackbear",
                                 },
                                 "url": "blackbear_url",
+                                "title": "Idfc",
                             }
                         },
                         {
                             "result": {
                                 "primary_artist": {"name": "El\u002dp"},
                                 "url": "El-p_url",
+                                "title": "Idfc",
                             }
                         },
                     ]