diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index ccc8cae75f..22f7ec4275 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -14,7 +14,6 @@ """Fetches, embeds, and displays lyrics.""" -import difflib import errno import itertools import json @@ -24,6 +23,7 @@ import unicodedata import urllib import warnings +from functools import cached_property, partial import requests from unidecode import unidecode @@ -356,15 +356,43 @@ def fetch(self, artist, title, album=None, length=None): return lyrics -class Genius(Backend): +class SearchBackend(Backend): + REQUIRES_BS = True + + @cached_property + def dist_thresh(self) -> float: + return self.config["dist_thresh"].get(float) + + def check_match( + self, target_artist: str, target_title: str, artist: str, title: str + ) -> bool: + """Check if the given artist and title are 'good enough' match.""" + max_dist = max( + string_dist(target_artist, artist), + string_dist(target_title, title), + ) + if round(max_dist, 2) <= self.dist_thresh: + return True + + self._log.warning( + "({}, {}) pair does not match ({}, {}), dist {:.2f} > expected {:.2f}", + artist, + title, + target_artist, + target_title, + max_dist, + self.dist_thresh, + ) + return False + + +class Genius(SearchBackend): """Fetch lyrics from Genius via genius-api. Simply adapted from bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/ """ - REQUIRES_BS = True - base_url = "https://api.genius.com" def __init__(self, config, log): @@ -387,19 +415,15 @@ def fetch(self, artist, title, album=None, length=None): self._log.debug("Genius API request returned invalid JSON") return None - # find a matching artist in the json + check = partial(self.check_match, artist, title) for hit in json["response"]["hits"]: - hit_artist = hit["result"]["primary_artist"]["name"] - - if slug(hit_artist) == slug(artist): - html = self.fetch_url(hit["result"]["url"]) + result = hit["result"] + if check(result["primary_artist"]["name"], result["title"]): + html = self.fetch_url(result["url"]) if not html: return None return self._scrape_lyrics_from_html(html) - self._log.debug( - "Genius failed to find a matching artist for '{0}'", artist - ) return None def _search(self, artist, title): @@ -494,10 +518,8 @@ def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup): return lyrics_div.get_text() -class Tekstowo(Backend): +class Tekstowo(SearchBackend): # Fetch lyrics from Tekstowo.pl. - REQUIRES_BS = True - BASE_URL = "http://www.tekstowo.pl" URL_PATTERN = BASE_URL + "/wyszukaj.html?search-title=%s&search-artist=%s" @@ -563,14 +585,9 @@ def extract_lyrics(self, html, artist, title): if not info_elements: return None - html_title = info_elements[-1].get_text() html_artist = info_elements[-2].get_text() - - title_dist = string_dist(html_title, title) - artist_dist = string_dist(html_artist, artist) - - thresh = self.config["dist_thresh"].get(float) - if title_dist > thresh or artist_dist > thresh: + html_title = info_elements[-1].get_text() + if not self.check_match(artist, title, html_artist, html_title): return None lyrics_div = soup.select("div.song-text > div.inner-text") @@ -649,16 +666,9 @@ def is_text_notcode(text): return None -class Google(Backend): +class Google(SearchBackend): """Fetch lyrics from Google search results.""" - REQUIRES_BS = True - - def __init__(self, config, log): - super().__init__(config, log) - self.api_key = config["google_API_key"].as_str() - self.engine_id = config["google_engine_ID"].as_str() - def is_lyrics(self, text, artist=None): """Determine whether the text seems to be valid lyrics.""" if not text: @@ -704,20 +714,21 @@ def slugify(self, text): BY_TRANS = ["by", "par", "de", "von"] LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"] - def is_page_candidate(self, url_link, url_title, title, artist): + def is_page_candidate( + self, artist: str, title: str, url_link: str, url_title: str + ) -> bool: """Return True if the URL title makes it a good candidate to be a page that contains lyrics of title by artist. """ - title = self.slugify(title.lower()) + title_slug = self.slugify(title.lower()) + url_title_slug = self.slugify(url_title.lower()) + if title_slug in url_title_slug: + return True + artist = self.slugify(artist.lower()) sitename = re.search( "//([^/]+)/.*", self.slugify(url_link.lower()) ).group(1) - url_title = self.slugify(url_title.lower()) - - # Check if URL title contains song title (exact match) - if url_title.find(title) != -1: - return True # or try extracting song title from URL title and check if # they are close enough @@ -727,18 +738,15 @@ def is_page_candidate(self, url_link, url_title, title, artist): + self.LYRICS_TRANS ) tokens = [re.escape(t) for t in tokens] - song_title = re.sub("(%s)" % "|".join(tokens), "", url_title) + song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug) - song_title = song_title.strip("_|") - typo_ratio = 0.9 - ratio = difflib.SequenceMatcher(None, song_title, title).ratio() - return ratio >= typo_ratio + return self.check_match(artist, title_slug, artist, song_title) def fetch(self, artist, title, album=None, length=None): query = f"{artist} {title}" url = "https://www.googleapis.com/customsearch/v1?key=%s&cx=%s&q=%s" % ( - self.api_key, - self.engine_id, + self.config["google_API_key"].as_str(), + self.config["google_engine_ID"].as_str(), urllib.parse.quote(query.encode("utf-8")), ) @@ -755,24 +763,21 @@ def fetch(self, artist, title, album=None, length=None): self._log.debug("google backend error: {0}", reason) return None - if "items" in data.keys(): - for item in data["items"]: - url_link = item["link"] - url_title = item.get("title", "") - if not self.is_page_candidate( - url_link, url_title, title, artist - ): - continue - html = self.fetch_url(url_link) - if not html: - continue - lyrics = scrape_lyrics_from_html(html) - if not lyrics: - continue - - if self.is_lyrics(lyrics, artist): - self._log.debug("got lyrics from {0}", item["displayLink"]) - return lyrics + check_candidate = partial(self.is_page_candidate, artist, title) + for item in data["items"]: + url_link = item["link"] + if not check_candidate(url_link, item.get("title", "")): + continue + html = self.fetch_url(url_link) + if not html: + continue + lyrics = scrape_lyrics_from_html(html) + if not lyrics: + continue + + if self.is_lyrics(lyrics, artist): + self._log.debug("got lyrics from {0}", item["displayLink"]) + return lyrics return None @@ -796,6 +801,7 @@ def __init__(self): "bing_client_secret": None, "bing_lang_from": [], "bing_lang_to": None, + "dist_thresh": 0.11, "google_API_key": None, "google_engine_ID": "009217259823014548361:lndtuqkycfu", "genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W" @@ -807,7 +813,6 @@ def __init__(self): # Musixmatch is disabled by default as they are currently blocking # requests with the beets user agent. "sources": [s for s in self.SOURCES if s != "musixmatch"], - "dist_thresh": 0.1, } ) self.config["bing_client_secret"].redact = True diff --git a/docs/changelog.rst b/docs/changelog.rst index a9ed03e422..97ca95c122 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -44,6 +44,9 @@ Bug fixes: * :doc:`plugins/discogs`: Fix the ``TypeError`` when there is no description. * Remove single quotes from all SQL queries :bug:`4709` +* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able + to match lyrics when there was a slight variation in the artist name. + :bug:`4791` For packagers: diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst index 4939476938..f94d3a209e 100644 --- a/docs/plugins/lyrics.rst +++ b/docs/plugins/lyrics.rst @@ -42,6 +42,12 @@ configuration file. The available options are: Default: ``[]`` - **bing_lang_to**: Language to translate lyrics into. Default: None. +- **dist_thresh**: The maximum distance between the artist and title + combination of the music file and lyrics candidate to consider them a match. + Lower values will make the plugin more strict, higher values will make it + more lenient. This does not apply to the ``lrclib`` backend as it matches + durations. + Default: ``0.11``. - **fallback**: By default, the file will be left unchanged when no lyrics are found. Use the empty string ``''`` to reset the lyrics in such a case. Default: None. diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 937e0a3cb1..79b4f6e268 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -18,7 +18,7 @@ import os import re import unittest -from unittest.mock import MagicMock, patch +from unittest.mock import patch import confuse import pytest @@ -31,11 +31,12 @@ from beetsplug import lyrics log = logging.getLogger("beets.test_lyrics") -raw_backend = lyrics.Backend({}, log) -google = lyrics.Google(MagicMock(), log) -genius = lyrics.Genius(MagicMock(), log) -tekstowo = lyrics.Tekstowo(MagicMock(), log) -lrclib = lyrics.LRCLib(MagicMock(), log) +plugin = lyrics.LyricsPlugin() +raw_backend = lyrics.Backend(plugin.config, log) +google = lyrics.Google(plugin.config, log) +genius = lyrics.Genius(plugin.config, log) +tekstowo = lyrics.Tekstowo(plugin.config, log) +lrclib = lyrics.LRCLib(plugin.config, log) class LyricsPluginTest(unittest.TestCase): @@ -244,6 +245,42 @@ def setUp(self): self.skipTest("Beautiful Soup 4 not available") +class TestSearchBackend: + @pytest.fixture + def backend(self, dist_thresh): + plugin = lyrics.LyricsPlugin() + plugin.config.set({"dist_thresh": dist_thresh}) + return lyrics.SearchBackend(plugin.config, plugin._log) + + @pytest.mark.parametrize( + "dist_thresh, target_artist, artist, should_match", + [ + (0.11, "Target Artist", "Target Artist", True), + (0.11, "Target Artist", "Target Artis", True), + (0.11, "Target Artist", "Target Arti", False), + (0.11, "Psychonaut", "Psychonaut (BEL)", True), + (0.11, "beets song", "beats song", True), + (0.10, "beets song", "beats song", False), + ( + 0.11, + "Lucid Dreams (Forget Me)", + "Lucid Dreams (Remix) ft. Lil Uzi Vert", + False, + ), + ( + 0.12, + "Lucid Dreams (Forget Me)", + "Lucid Dreams (Remix) ft. Lil Uzi Vert", + True, + ), + ], + ) + def test_check_match(self, backend, target_artist, artist, should_match): + assert ( + backend.check_match(target_artist, "", artist, "") == should_match + ) + + class LyricsPluginSourcesTest(LyricsGoogleBaseTest, LyricsAssertions): """Check that beets google custom search engine sources are correctly scraped. @@ -400,7 +437,7 @@ def test_is_page_candidate_exact_match(self): html, "html.parser", parse_only=SoupStrainer("title") ) assert google.is_page_candidate( - url, soup.title.string, s["title"], s["artist"] + s["artist"], s["title"], url, soup.title.string ), url def test_is_page_candidate_fuzzy_match(self): @@ -413,12 +450,12 @@ def test_is_page_candidate_fuzzy_match(self): # very small diffs (typo) are ok eg 'beats' vs 'beets' with same artist assert google.is_page_candidate( - url, url_title, s["title"], s["artist"] + s["artist"], s["title"], url, url_title ), url # reject different title url_title = "example.com | seets bong lyrics by John doe" assert not google.is_page_candidate( - url, url_title, s["title"], s["artist"] + s["artist"], s["title"], url, url_title ), url def test_is_page_candidate_special_chars(self): @@ -430,7 +467,7 @@ def test_is_page_candidate_special_chars(self): url = s["url"] + s["path"] url_title = "foo" - google.is_page_candidate(url, url_title, s["title"], "Sunn O)))") + google.is_page_candidate("Sunn O)))", s["title"], url, url_title) # test Genius backend @@ -506,12 +543,14 @@ def test_json(self, mock_fetch_url, mock_scrape): "name": "\u200bblackbear", }, "url": "blackbear_url", + "title": "Idfc", } }, { "result": { "primary_artist": {"name": "El\u002dp"}, "url": "El-p_url", + "title": "Idfc", } }, ]