From 46de27b310022e3ac7a856c5fc4f4b0df1d76af7 Mon Sep 17 00:00:00 2001 From: J W Date: Tue, 19 Nov 2019 15:57:35 -0800 Subject: [PATCH] PS-7713 : Python 3 compatibility (#50) * Remove hardcoded version from Cython. Py3 compatibility in recent revisions. * Replace implementation of shingle function. Old implementation blindly called next() on iterable objects without checking for StopIteration exceptions (exhaustion). * Repinning Cython version to post Py3 compatibility fixes. * Black formatting applied; updated version & contact info. --- requirements.txt | 2 +- setup.py | 66 +++++++++++++++++++-------------------------- simhash/__init__.py | 28 ++++++++++++------- 3 files changed, 47 insertions(+), 49 deletions(-) diff --git a/requirements.txt b/requirements.txt index 916a1cb..2bb7ac1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ colorama==0.3.7 coverage==4.1 -Cython==0.24.1 +Cython==0.29.14 nose==1.3.7 nose-timer==0.6.0 python-termstyle==0.1.10 diff --git a/setup.py b/setup.py index 11b1f97..85dfb7e 100755 --- a/setup.py +++ b/setup.py @@ -6,60 +6,50 @@ # Complain on 32-bit systems. See README for more details import struct -if struct.calcsize('P') < 8: - raise RuntimeError( - 'Simhash-py does not work on 32-bit systems. See README.md') -ext_files = [ - 'simhash/simhash-cpp/src/permutation.cpp', - 'simhash/simhash-cpp/src/simhash.cpp' -] +if struct.calcsize("P") < 8: + raise RuntimeError("Simhash-py does not work on 32-bit systems. See README.md") + +ext_files = ["simhash/simhash-cpp/src/permutation.cpp", "simhash/simhash-cpp/src/simhash.cpp"] kwargs = {} try: from Cython.Distutils import build_ext - print('Building from Cython') - ext_files.append('simhash/simhash.pyx') - kwargs['cmdclass'] = {'build_ext': build_ext} + + print("Building from Cython") + ext_files.append("simhash/simhash.pyx") + kwargs["cmdclass"] = {"build_ext": build_ext} except ImportError: - print('Buidling from C++') - ext_files.append('simhash/simhash.cpp') + print("Buidling from C++") + ext_files.append("simhash/simhash.cpp") ext_modules = [ Extension( - 'simhash.simhash', ext_files, - language='c++', - extra_compile_args=['-std=c++11'], - include_dirs=['simhash/simhash-cpp/include'] + "simhash.simhash", + ext_files, + language="c++", + extra_compile_args=["-std=c++11"], + include_dirs=["simhash/simhash-cpp/include"], ) ] setup( - name='simhash-py', - version='0.4.0', - description='Near-Duplicate Detection with Simhash', - url='http://github.com/seomoz/simhash-py', - author='Dan Lecocq', - author_email='dan@moz.com', + name="simhash-py", + version="0.4.1", + description="Near-Duplicate Detection with Simhash", + url="http://github.com/seomoz/simhash-py", + author="Moz Pro Services", + author_email="turbo@moz.com", classifiers=[ - 'Programming Language :: Python', - 'Intended Audience :: Developers', - 'Operating System :: OS Independent', - 'Topic :: Internet :: WWW/HTTP' + "Programming Language :: Python", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "Topic :: Internet :: WWW/HTTP", ], ext_modules=ext_modules, - packages=[ - 'simhash' - ], - package_dir={ - 'simhash': 'simhash' - }, - tests_require=[ - 'coverage', - 'nose', - 'nose-timer', - 'rednose' - ], + packages=["simhash"], + package_dir={"simhash": "simhash"}, + tests_require=["coverage", "nose", "nose-timer", "rednose"], **kwargs ) diff --git a/simhash/__init__.py b/simhash/__init__.py index 2bf0e90..9cc1bc6 100644 --- a/simhash/__init__.py +++ b/simhash/__init__.py @@ -5,14 +5,22 @@ def shingle(tokens, window=4): - '''A generator for a moving window of the provided tokens.''' + """A generator for a moving window of the provided tokens.""" if window <= 0: - raise ValueError('Window size must be positive') - its = [] - for number in six_range(window): - it = iter(tokens) - its.append(it) - for _ in six_range(number): - next(it) - while True: - yield [next(it) for it in its] + raise ValueError("Window size must be positive") + + # Start with an empty output set. + curr_window = [] + + # Iterate over the input tokens, once. + for token in tokens: + # Add to the window. + curr_window.append(token) + + # If we've collected too many, remove the oldest item(s) from the collection + while len(curr_window) > window: + curr_window.pop(0) + + # Finally, if the window is full, yield the data set. + if len(curr_window) == window: + yield list(curr_window)