Skip to content

Commit

Permalink
PS-7713 : Python 3 compatibility (#50)
Browse files Browse the repository at this point in the history
* Remove hardcoded version from Cython. Py3 compatibility in recent revisions.
* Replace implementation of shingle function.
   The old implementation called next() on its iterators without handling StopIteration, so running out of tokens leaked the exception out of the generator (a RuntimeError under PEP 479 on Python 3.7+).
* Repinning Cython version to post Py3 compatibility fixes.
* Black formatting applied; updated version & contact info.
  • Loading branch information
scriptedworld authored Nov 19, 2019
1 parent 0353f44 commit 46de27b
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 49 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
colorama==0.3.7
coverage==4.1
Cython==0.24.1
Cython==0.29.14
nose==1.3.7
nose-timer==0.6.0
python-termstyle==0.1.10
Expand Down
66 changes: 28 additions & 38 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,60 +6,50 @@

# Complain on 32-bit systems. See README for more details
import struct
# Pointer size below 8 bytes means a 32-bit interpreter, which is unsupported.
if struct.calcsize("P") < 8:
    raise RuntimeError("Simhash-py does not work on 32-bit systems. See README.md")

# C++ sources that are compiled into the extension on every build path.
ext_files = [
    "simhash/simhash-cpp/src/permutation.cpp",
    "simhash/simhash-cpp/src/simhash.cpp",
]

# Extra keyword arguments forwarded to setup(); only filled in when Cython
# is importable.
kwargs = {}

# Keep the try body to the import alone so that only a missing Cython
# selects the C++ fallback.
try:
    from Cython.Distutils import build_ext
except ImportError:
    # NOTE(review): simhash/simhash.cpp is presumably the pre-generated
    # Cython output shipped with the source distribution -- confirm.
    print("Building from C++")
    ext_files.append("simhash/simhash.cpp")
else:
    print("Building from Cython")
    ext_files.append("simhash/simhash.pyx")
    kwargs["cmdclass"] = {"build_ext": build_ext}

ext_modules = [
    Extension(
        "simhash.simhash",
        ext_files,
        language="c++",
        extra_compile_args=["-std=c++11"],
        include_dirs=["simhash/simhash-cpp/include"],
    )
]

setup(
    name="simhash-py",
    version="0.4.1",
    description="Near-Duplicate Detection with Simhash",
    url="http://github.com/seomoz/simhash-py",
    author="Moz Pro Services",
    author_email="turbo@moz.com",
    classifiers=[
        "Programming Language :: Python",
        "Intended Audience :: Developers",
        "Operating System :: OS Independent",
        "Topic :: Internet :: WWW/HTTP",
    ],
    ext_modules=ext_modules,
    packages=["simhash"],
    package_dir={"simhash": "simhash"},
    tests_require=["coverage", "nose", "nose-timer", "rednose"],
    **kwargs
)
28 changes: 18 additions & 10 deletions simhash/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,22 @@


def shingle(tokens, window=4):
    """Yield every run of ``window`` consecutive tokens as a list.

    The input is consumed exactly once, so one-shot iterators and
    generators are handled the same way as sequences. Nothing is yielded
    when ``tokens`` holds fewer than ``window`` items.

    Args:
        tokens: Any iterable of tokens.
        window: Number of tokens per shingle; a positive integer.

    Yields:
        A new list of ``window`` tokens for each position of the moving
        window, in input order.

    Raises:
        ValueError: If ``window`` is not positive. Because this is a
            generator, the error surfaces on first iteration rather than
            at call time.
    """
    # Imported locally because the module's import block is outside this hunk.
    from collections import deque

    if window <= 0:
        raise ValueError("Window size must be positive")

    # A bounded deque drops the oldest token automatically once it is full.
    curr_window = deque(maxlen=window)

    for token in tokens:
        curr_window.append(token)

        # Only emit once enough tokens have been collected; copy so callers
        # never see the buffer mutate underneath them.
        if len(curr_window) == window:
            yield list(curr_window)

0 comments on commit 46de27b

Please sign in to comment.