diff --git a/core/utils.py b/core/utils.py
index 905d390..c445494 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -139,12 +139,10 @@ def extract_headers(headers):
 
 def top_level(url, fix_protocol=True):
     """Extract the top level domain from an URL."""
-    ext = tld.get_tld(url, fix_protocol=fix_protocol)
-    toplevel = '.'.join(urlparse(url).netloc.split('.')[-2:]).split(
-        ext)[0] + ext
+    res = tld.get_tld(url, fix_protocol=fix_protocol, as_object=True)
+    toplevel = res.domain + '.' + res.tld
     return toplevel
 
-
 def is_proxy_list(v, proxies):
     if os.path.isfile(v):
         with open(v, 'r') as _file:
diff --git a/test_utils.py b/test_utils.py
new file mode 100644
index 0000000..5e5f6f1
--- /dev/null
+++ b/test_utils.py
@@ -0,0 +1,19 @@
+import pytest
+import tld.exceptions
+from core.utils import top_level
+
+def test_top_level_with_https_url():
+    assert top_level('https://google.co.uk') == 'google.co.uk'
+    assert top_level('https://google.com') == 'google.com'
+
+def test_top_level_with_one_level_domain():
+    assert top_level('google.co.uk') == 'google.co.uk'
+    assert top_level('google.com') == 'google.com'
+
+def test_top_level_with_second_level_domain():
+    assert top_level('123.google.co.uk') == 'google.co.uk'
+    assert top_level('123.google.com') == 'google.com'
+
+def test_top_level_with_wrong_domain():
+    with pytest.raises(tld.exceptions.TldDomainNotFound):
+        top_level('google.co.uk2')