From 4028b8a41d92fe281f77799729996c32c6c55e58 Mon Sep 17 00:00:00 2001 From: automationator Date: Mon, 27 Aug 2018 09:24:49 -0400 Subject: [PATCH] Adds support for properly finding meta-refresh URLs --- urlfinderlib/urlfinderlib.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/urlfinderlib/urlfinderlib.py b/urlfinderlib/urlfinderlib.py index 5e64f72..24a2f39 100644 --- a/urlfinderlib/urlfinderlib.py +++ b/urlfinderlib/urlfinderlib.py @@ -140,6 +140,23 @@ def _recursive_tag_values(tag, values=[]): # Loop over both soups. for soup in soups: + # Find any meta-refresh URLs. + meta_urls = [] + meta_tags = soup.find_all('meta') + for meta_tag in meta_tags: + for key in meta_tag.attrs: + if key.lower() == 'content': + value = meta_tag.attrs[key] + if 'url=' in value: + split_value = value.split('url=') + url = split_value[1] + # Remove any quotes around the URL. + if url.startswith('"') and url.endswith('"'): + url = url[1:-1] + if url.startswith("'") and url.endswith("'"): + url = url[1:-1] + meta_urls.append(url) + # Hacky way to find URLs in the CSS. css_urls = re.compile(r'url\((.*?)\)').findall(str(soup)) @@ -190,6 +207,7 @@ def _recursive_tag_values(tag, values=[]): else: urls = _recursive_tag_values(soup) urls += css_urls + urls += meta_urls # As a last-ditch effort, find URLs in the visible text of the HTML. However, # we only want to add strings that are valid URLs as they are. What we do not