Skip to content

Commit

Permalink
Use new urlpy fork instead of url.py #1833 #2021
Browse files Browse the repository at this point in the history
The latest url.py is written in C++.
We do not need that speed, but we do need portability and pure
Python. The new urlpy is a fork of url.py v2.0 that has been ported to
Python 3 and works on both Python 2 and 3.

Signed-off-by: Philippe Ombredanne <[email protected]>
  • Loading branch information
pombredanne committed Apr 28, 2020
1 parent 1d7f0ba commit f1ee1e3
Show file tree
Hide file tree
Showing 18 changed files with 78 additions and 102 deletions.
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,9 @@ def read(*names, **kwargs):
# cluecode
# Some nltk version ranges are buggy
'nltk >= 3.2, < 4.0',
'publicsuffix2',
'py2_ipaddress >= 2.0, <3.5; python_version<"3"',
'url >= 0.1.4, < 0.1.6; python_version<"3"',
'url >= 0.4.2, < 1.0.0; python_version>="3"',
'urlpy',
'publicsuffix2',
'fingerprints >= 0.6.0, < 1.0.0',

# extractcode
Expand Down
34 changes: 10 additions & 24 deletions src/cluecode/finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,9 @@

import ipaddress
from six import string_types
import url as urlpy
import urlpy

from commoncode import compat
from commoncode.system import py2
from commoncode.system import py3
from commoncode.text import toascii
from cluecode import finder_data
Expand Down Expand Up @@ -83,7 +82,7 @@ def find(location, patterns):
loc = pformat(location)
logger_debug('find(location=%(loc)r,\n patterns=%(patterns)r)' % locals())

for lineno, line in analysis.numbered_text_lines(location):
for lineno, line in analysis.numbered_text_lines(location, demarkup=False):
for key, pattern in patterns:
for match in pattern.findall(line):

Expand Down Expand Up @@ -385,14 +384,6 @@ def canonical_url(uri):
parsed = urlpy.parse(uri)
if not parsed:
return

if py2:
if not hasattr(parsed, '_scheme') or not hasattr(parsed, '_host'):
raise Exception('a')
return
else:
if not hasattr(parsed, 'scheme') or not hasattr(parsed, 'host'):
raise Exception('b')
if TRACE:
logger_debug('canonical_url: parsed:', parsed)

Expand All @@ -406,17 +397,12 @@ def canonical_url(uri):
if TRACE:
logger_debug('canonical_url: punycoded:', punycoded)

if py2:
if punycoded._port == DEFAULT_PORTS.get(punycoded._scheme):
punycoded._port = None
else:
if punycoded.port == DEFAULT_PORTS.get(punycoded.scheme):
punycoded.port = 0
if py2:
decoded = punycoded.utf8()
else:
decoded = punycoded.utf8
return decoded.decode('utf-8')
deport = punycoded.remove_default_port()

if TRACE:
logger_debug('canonical_url: deport:', deport)

return str(sanitized)
except Exception as e:
if TRACE:
logger_debug('canonical_url: failed for:', uri, 'with:', repr(e))
Expand Down Expand Up @@ -539,10 +525,10 @@ def url_host_domain(url):
"""
try:
parsed = urlpy.parse(url)
host = parsed.host if py3 else parsed._host
host = parsed.host
if not host:
return None, None
domain = parsed.pld if py3 else parsed.pld()
domain = parsed.pld
return host.lower(), domain.lower()
except Exception as e:
if TRACE:
Expand Down
1 change: 0 additions & 1 deletion src/licensedcode/data/licenses/d-fsl-1.0-en.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,5 @@ ignorable_holders:
- Ministry of Science and Research, State of North-Rhine Westphalia
ignorable_urls:
- http://www.d-fsl.org/
- http://www.d/
- http://www.fsf.org/licenses/gpl
- http://www.ifross.de/
4 changes: 2 additions & 2 deletions src/licensedcode/data/rules/ekioh_mit_like2.RULE
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
Licenced as X11: http://www.kry ogenix.org/code/browser/licence.html
This basically means: do what you want with it.
Licenced as X11: http://www.kryogenix.org/code/browser/licence.html
This basically means: do what you want with it.
2 changes: 1 addition & 1 deletion src/licensedcode/data/rules/ekioh_mit_like2.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ license_expression: ekioh
is_license_notice: yes
notes: this text is an MIT without a condition, only a disclaimer from https://kryogenix.org/code/browser/licence.html
ignorable_urls:
- http://www.kry/
- http://www.kryogenix.org/code/browser/licence.html

2 changes: 1 addition & 1 deletion src/licensedcode/data/rules/sgi-freeb-2.0_6.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ ignorable_holders:
- dates of first publication Silicon Graphics, Inc.
ignorable_urls:
- http://oss.sgi.com/projects/FreeB
- http://oss.sgi.com/projects/FreeB/&quot;http://oss.sgi.com/projects/FreeB/
- http://oss.sgi.com/projects/FreeB/&quot;http:/oss.sgi.com/projects/FreeB/
13 changes: 11 additions & 2 deletions tests/cluecode/test_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,12 +693,21 @@ def test_misc_invalid_urls_that_are_still_detected_and_normalized(self):
# set of non URLs from https://mathiasbynens.be/demo/url-regex
urls = u'''
http://www.foo.bar./
'''
for test in urls.split():
result = [val for val, _ln in finder.find_urls([test])]
assert [test] == result

@pytest.mark.skipif(not py3, reason='url-cpp behaves differently')
def test_invalid_urls_are_not_detected(self):
# set of non URLs from https://mathiasbynens.be/demo/url-regex
urls = u'''
http://1.1.1.1.1
http://-error-.invalid/
'''
for test in urls.split():
result = [val.replace('.', '') for val, _ln in finder.find_urls([test])]
assert result in ([test.replace('.', '')] , [test.replace('.', '') + u'/'])
result = [val for val, _ln in finder.find_urls([test])]
assert [] == result

def test_misc_invalid_urls_that_should_not_be_detected(self):
# At least per this set of non URLs from https://mathiasbynens.be/demo/url-regex
Expand Down
Binary file not shown.
31 changes: 0 additions & 31 deletions thirdparty/publicsuffix2-2.20180921-py2.py3-none-any.whl.ABOUT

This file was deleted.

Binary file not shown.
32 changes: 32 additions & 0 deletions thirdparty/publicsuffix2-2.20191221-py2.py3-none-any.whl.ABOUT
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
about_resource: publicsuffix2-2.20191221-py2.py3-none-any.whl
name: publicsuffix2
version: '2.20191221'
download_url: https://files.pythonhosted.org/packages/9d/16/053c2945c5e3aebeefb4ccd5c5e7639e38bc30ad1bdc7ce86c6d01707726/publicsuffix2-2.20191221-py2.py3-none-any.whl
description: A fork of publicsuffix with updated data as "package data" in a wheel friendly
format
homepage_url: https://github.com/nexB/python-publicsuffix2
license_expression: mit AND mpl-2.0
copyright: |
Copyright (c) nexB Inc.
Copyright (c) Tomaž Šolc
Copyright (c) David Wilson
Copyright (c) Mozilla
notice_file: publicsuffix2-2.20191221-py2.py3-none-any.whl.NOTICE
notice_url: https://github.com/nexB/python-publicsuffix2/blob/develop/publicsuffix2.LICENSE
redistribute: yes
attribute: yes
track_changes: yes
owner: nexB
owner_url: http://www.nexb.com/
contact: http://www.nexb.com/contactus.html
vcs_repository: git+https://github.com/nexB/python-publicsuffix2.git
checksum_md5: eb5b7bd06270ca4c90352541831cea58
checksum_sha1: dacd55374dc527e0d01986d24cd143acdf292dcc
package_url: pkg:pypi/publicsuffix2@2.20191221
licenses:
- key: mit
name: MIT License
file: mit.LICENSE
- key: mpl-2.0
name: Mozilla Public License 2.0
file: mpl-2.0.LICENSE
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Copyright (c) 2015 nexB Inc.
This code is based on Tomaž Šolc fork of David Wilson code originally at
Copyright (c) nexB Inc.
This code is based on Tomaz Solc fork of David Wilson code originally at
https://www.tablix.org/~avian/git/publicsuffix.git

Copyright (c) 2014 Tomaž Šolc <[email protected]>
Copyright (c) 2014 Tomaz Solc <[email protected]>

Python module included in this distribution is based on the code downloaded
from http://code.google.com/p/python-public-suffix-list/, which is
Expand Down Expand Up @@ -32,4 +32,4 @@ DEALINGS IN THE SOFTWARE.
The Public Suffix List vendored in this distribution has been downloaded
from http://publicsuffix.org/public_suffix_list.dat
This data file is licensed under the MPL-2.0 license.
http://mozilla.org/MPL/2.0/
http://mozilla.org/MPL/2.0/
Binary file removed thirdparty/url-0.1.4.5-py2-none-any.whl
Binary file not shown.
Binary file removed thirdparty/url-0.1.4.5-py3-none-any.whl
Binary file not shown.
13 changes: 0 additions & 13 deletions thirdparty/url.ABOUT

This file was deleted.

20 changes: 0 additions & 20 deletions thirdparty/url.LICENSE

This file was deleted.

Binary file added thirdparty/urlpy-0.5-py2.py3-none-any.whl
Binary file not shown.
15 changes: 15 additions & 0 deletions thirdparty/urlpy-0.5-py2.py3-none-any.whl.ABOUT
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
about_resource: urlpy-0.5-py2.py3-none-any.whl
download_url: https://files.pythonhosted.org/packages/23/f0/43a8013e888f435c619f82b485ef8cf9fddfcceea7806d824b28d5ef8f76/urlpy-0.5-py2.py3-none-any.whl
homepage_url: https://github.com/nexB/urlpy
license_expression: mit
copyright: |
Copyright (c) Moz Inc.
Copyright (c) nexB Inc.
attribute: yes
checksum_md5: 91eeb2f03ab9a7d91f9d110223f7cc55
checksum_sha1: c38d10f349c612fdbaee632b298a6883e5fc75cf
package_url: pkg:pypi/urlpy@0.5
licenses:
- key: mit
name: MIT License
file: mit.LICENSE

0 comments on commit f1ee1e3

Please sign in to comment.