diff --git a/.gitignore b/.gitignore index 84e60beb..34cbb31c 100644 --- a/.gitignore +++ b/.gitignore @@ -71,3 +71,6 @@ tests/gif/standardized_text.txt tests/jpg/standardized_text.txt tests/tiff/standardized_text.txt tests/pdf/ocr_text.txt + +# PyCharm +.idea/ diff --git a/.travis.yml b/.travis.yml index 6b9c5caf..973006a2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,25 +3,18 @@ os: linux language: python python: - - "2.7" - "3.7" # install system dependencies here with apt-get. before_install: - sudo ./provision/debian.sh - - python -m pip install --upgrade pip + - python -m pip install --upgrade pip setuptools wheel # install python dependencies including this package in the travis # virtualenv install: - - - if [[ $TRAVIS_PYTHON_VERSION == 3.7 ]]; - then ./provision/python3.sh; - fi - - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; - then ./provision/python2.sh; - fi - - pip install .[pocketsphinx] + - ./provision/python.sh + - pip install . # commands to run the testing suite. if any of these fail, travic lets us know script: @@ -29,9 +22,7 @@ script: - nosetests --with-coverage --cover-package=textract - cd tests && pytest && cd - # - pycodestyle textract/ bin/textract - - if [[ $TRAVIS_PYTHON_VERSION == 3.7 ]]; - then cd docs && make html && cd -; - fi + - cd docs && make html && cd -; # commands to run after the tests successfully complete after_success: diff --git a/Vagrantfile b/Vagrantfile index 93bcb20e..6eb3bbca 100644 --- a/Vagrantfile +++ b/Vagrantfile @@ -27,8 +27,7 @@ Vagrant.configure("2") do |config| vb.customize ["modifyvm", :id, "--ioapic", "on"] vb.customize ["modifyvm", :id, "--cpus", "2"] vb.customize ["modifyvm", :id, "--memory", "2048"] - override_config.vm.box = "trusty64" - override_config.vm.box_url = "https://cloud-images.ubuntu.com/vagrant/trusty/current/trusty-server-cloudimg-amd64-vagrant-disk1.box" + override_config.vm.box = "ubuntu/focal64" end # steps for provisioning so that these provisioning steps are diff --git a/bin/textract 
b/bin/textract old mode 100644 new mode 100755 diff --git a/docs/changelog.rst b/docs/changelog.rst index cb81e431..b2e6a5a8 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -10,6 +10,11 @@ latest changes in development for next release ---------------------------------------------- .. THANKS FOR CONTRIBUTING; ADD YOUR UNRELEASED CHANGES HERE! +1.7.0 +------------------- + +* Dropped python2 support + 1.6.5 ------------------- diff --git a/docs/conf.py b/docs/conf.py index a4a0b521..d2ab6161 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -58,7 +58,7 @@ # built documents. # # The short X.Y version. -release = version = "1.6.5" +release = version = "1.7.0" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/index.rst b/docs/index.rst index ee77e26f..a12ea852 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -86,7 +86,7 @@ file types by either mentioning them on the `issue tracker * ``.wav`` via `SpeechRecognition`_ and `pocketsphinx`_ -* ``.xlsx`` via `xlrd `_ +* ``.xlsx`` via `openpyxl `_ * ``.xls`` via `xlrd `_ diff --git a/provision/python3.sh b/provision/python.sh similarity index 92% rename from provision/python3.sh rename to provision/python.sh index 3c8f913d..b22bdd8e 100755 --- a/provision/python3.sh +++ b/provision/python.sh @@ -12,5 +12,5 @@ fi pip install -U pip # Install the requirements for this package as well as this module. -pip install -r requirements/python-dev3 +pip install -r requirements/python-dev pip install -r requirements/python-doc diff --git a/provision/python2.sh b/provision/python2.sh deleted file mode 100755 index d960b630..00000000 --- a/provision/python2.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -# This needs to work for vagrant, Travis builds, and Docker builds. -# in a python virtualenv. in the virtual machine provisioning, -# we're passing the directory this should be run from. in travis-ci, -# its run from the root of the repository. 
-if [ "$#" -eq 1 ]; then - cd $1 -fi - -# upgrade pip so we can use wheel downloads -pip install -U pip - -# Install the requirements for this package as well as this module. -pip install -r requirements/python-dev2 diff --git a/provision/travis-mock.sh b/provision/travis-mock.sh index 3f98cbed..64d689c9 100755 --- a/provision/travis-mock.sh +++ b/provision/travis-mock.sh @@ -6,7 +6,7 @@ # if its a problem. # http://docs.travis-ci.com/user/languages/python/#Travis-CI-Uses-Isolated-virtualenvs sudo apt-get update -qq -sudo apt-get install -y python-pip python-dev build-essential +sudo apt-get install -y python3-pip python3-dev build-essential # install pep8 and nose for testing sudo pip install pep8 nose diff --git a/requirements/debian b/requirements/debian index 14026fef..ec257150 100644 --- a/requirements/debian +++ b/requirements/debian @@ -9,7 +9,7 @@ make # these packages are required by python-docx, which depends on lxml # and requires these things -python-dev +python3-dev libxml2-dev libxslt1-dev @@ -48,3 +48,5 @@ swig # libxslt1-dev for compiling lxml. # https://github.com/deanmalmgren/textract/issues/19 zlib1g-dev + +python-is-python3 \ No newline at end of file diff --git a/requirements/python b/requirements/python index 1c200697..4e1f020e 100644 --- a/requirements/python +++ b/requirements/python @@ -1,13 +1,14 @@ # This file contains all python dependencies that are required by the textract # package in order for it to properly work. 
-argcomplete~=1.10.0 -beautifulsoup4~=4.8.0 -chardet==3.* -docx2txt~=0.8 -extract-msg<=0.29.* #Last with python2 support -pdfminer.six==20191110 #Last with python2 support -python-pptx~=0.6.18 -six~=1.12.0 -SpeechRecognition~=3.8.1 -xlrd~=1.2.0 +argcomplete>=1.10.0 +beautifulsoup4>=4.8.0 +chardet>=3.0 +docx2txt>=0.8 +extract-msg>=0.29.0 +pdfminer.six>=20191110 +python-pptx>=0.6.18 +six>=1.12.0 +SpeechRecognition>=3.8.1 +xlrd>=1.2.0 +openpyxl>=2.0.0 diff --git a/requirements/python-dev3 b/requirements/python-dev similarity index 100% rename from requirements/python-dev3 rename to requirements/python-dev diff --git a/requirements/python-dev2 b/requirements/python-dev2 deleted file mode 100644 index 8965d435..00000000 --- a/requirements/python-dev2 +++ /dev/null @@ -1,16 +0,0 @@ -# This includes all packages that are used in development, including all -# packages that are required by textract itself (python), packages for -# documentation builds (python-doc) - --r python - -# needed for tests/run.py script to read .travis.yml file -coveralls==1.8.2 -nose==1.3.7 -pycodestyle==2.5.0 -PyYAML==5.1.1 -requests==2.22.0 -pytest==4.6 - -# needed for managing versions -bumpversion==0.5.3 diff --git a/requirements/python-doc b/requirements/python-doc index d8cd533c..3b9752d8 100644 --- a/requirements/python-doc +++ b/requirements/python-doc @@ -1,5 +1,7 @@ # this only includes packages that are needed for documentation build. 
+jinja2<3.1 sphinx==2.1.2 sphinx_rtd_theme==0.4.3 sphinx-argparse==0.2.5 +pocketsphinx==0.1.15 diff --git a/setup.cfg b/setup.cfg index 7d0b8f72..16b9cafd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.6.5 +current_version = 1.7.0 commit = True tag = True diff --git a/setup.py b/setup.py index b9834138..c8495530 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ def parse_requirements(requirements_filename): setup( name=textract.__name__, - version="1.6.5", + version="1.7.0", description="extract text from any document. no muss. no fuss.", long_description=long_description, url=github_url, diff --git a/tests/Dockerfile b/tests/Dockerfile index 9eb31277..3d765a90 100644 --- a/tests/Dockerfile +++ b/tests/Dockerfile @@ -1,16 +1,15 @@ -FROM ubuntu:12.04 +FROM ubuntu:20.04 MAINTAINER Shawn Milochik ENV DEBIAN_FRONTEND noninteractive -ENV REFRESHED_AT 2014-08-12b +ENV REFRESHED_AT 2022-08-17 RUN apt-get update -RUN apt-get install python-pip -y -ADD . /src -WORKDIR /src -RUN /bin/bash /src/provision/debian.sh -RUN /bin/bash /src/provision/python.sh +RUN apt-get install python3-pip -y +ADD . /app +WORKDIR /app +RUN /bin/bash /app/provision/debian.sh +RUN /bin/bash /app/provision/python.sh RUN adduser --disabled-password --gecos "" --home=/home/textract textract -VOLUME ["/home/textract/src"] ENV PATH $PATH:/home/textract/src/bin ENV PYTHONPATH /home/textract/src USER textract -ENTRYPOINT ["/home/textract/src/tests/run.py"] +ENTRYPOINT ["/home/textract/src/tests/docker_entry.sh"] diff --git a/tests/docker_entry.sh b/tests/docker_entry.sh index 7398841c..18609572 100755 --- a/tests/docker_entry.sh +++ b/tests/docker_entry.sh @@ -3,4 +3,4 @@ # This script gets called from within the # Docker container. 
-./tests/run.py +cd "$(dirname "$0")" && make && pytest && cd - diff --git a/tests/pdf/raw_text-m=pdfminer.txt b/tests/pdf/raw_text-m=pdfminer.txt index 0e200c5e..8226e33e 100644 --- a/tests/pdf/raw_text-m=pdfminer.txt +++ b/tests/pdf/raw_text-m=pdfminer.txt @@ -1,59 +1,91 @@ I  love  word  documents.  They  are  lovely.  They  make  me  so  happy  I  could  smile.  And   that’s  why  I  wrote  this  package.   -   Sample text is hard. That’s where http://hipsum.co comes in handy. -   - Semiotics church-key VHS, Truffaut cliche actually vegan. Cray Austin + pop-up disrupt letterpress, kitsch fixie Cosby sweater cliche craft beer + PBR&B. Gentrify cornhole Tonx McSweeney's, Shoreditch keffiyeh + ethnic Marfa 90's kogi American Apparel. Shabby chic distillery church- + key locavore beard, food truck chillwave sartorial deep v flannel authentic + Tumblr narwhal kogi organic. Cred vegan jean shorts Banksy forage + Neutra dreamcatcher, hashtag Bushwick polaroid pork belly flannel + keytar Portland post-ironic. Cred hoodie vegan, food truck leggings + Austin pour-over banjo trust fund before they sold out cray Intelligentsia + plaid typewriter. Williamsburg XOXO plaid Carles Austin tofu. Carles Tonx keffiyeh, leggings 90's lo-fi kogi viral semiotics Brooklyn + biodiesel tousled bespoke kitsch. Vinyl Tonx art party Thundercats retro, + viral asymmetrical artisan bicycle rights bitters master cleanse Kickstarter + YOLO. Seitan street art semiotics twee skateboard, PBR&B VHS hashtag + meh. Thundercats semiotics shabby chic forage single-origin coffee retro, + 3 wolf moon iPhone mumblecore 90's trust fund Intelligentsia. Beard + gluten-free seitan, VHS sartorial pork belly gastropub meh whatever + authentic synth. Beard single-origin coffee irony fixie, before they sold +   +   out Pitchfork kitsch readymade. Helvetica butcher wayfarers, lomo artisan + hashtag Brooklyn four loko fanny pack 90's mustache 8-bit. 
Meh jean shorts selfies, crucifix selvage Helvetica Carles PBR Vice + Banksy roof party master cleanse ugh PBR&B. Lo-fi freegan salvia photo + booth, Wes Anderson skateboard Odd Future. Etsy art party Bushwick + keffiyeh. Pork belly 3 wolf moon butcher mustache. YOLO raw denim lo- + fi, hoodie gentrify Schlitz 8-bit sriracha Shoreditch retro brunch. + Williamsburg farm-to-table beard, mlkshk Banksy fap kogi Etsy art party + squid semiotics. XOXO church-key Pitchfork mlkshk irony tote bag. Farm-to-table brunch tattooed hoodie keytar, literally selvage authentic + trust fund deep v Thundercats Kickstarter narwhal locavore. Swag disrupt + chambray, leggings shabby chic gastropub YOLO plaid hoodie + Williamsburg Godard mixtape. Retro Godard keytar biodiesel, freegan + paleo Etsy you probably haven't heard of them Pitchfork Schlitz + readymade small batch cred. Pug trust fund paleo, 90's fixie typewriter + next level banjo. Banksy occupy authentic master cleanse Bushwick + fingerstache selfies, direct trade craft beer cliche +1 cray. Locavore four + loko biodiesel Neutra chia mlkshk. Fanny pack YOLO Portland, mlkshk + PBR&B single-origin coffee drinking vinegar 8-bit flannel gentrify + stumptown pop-up. + Oh. You need a little dummy text for your mockup? How quaint. I bet you’re still using Bootstrap too…   - \ No newline at end of file diff --git a/tests/run_docker_tests.sh b/tests/run_docker_tests.sh index 958b6d44..826f6db6 100755 --- a/tests/run_docker_tests.sh +++ b/tests/run_docker_tests.sh @@ -5,15 +5,10 @@ cd $(dirname $0)/.. base=$(pwd) -image="textract/ubuntu12.04" - -cp tests/Dockerfile ./Dockerfile +image="textract/ubuntu20.04" # Note: For speed, the image won't be automatically rebuilt. If the dependencies # change and the existing image is outdated, just delete it with: # docker rmi -docker images | grep $image || docker build -t $image . +docker images | grep $image || docker build -t $image -f tests/Dockerfile . 
docker run --rm -v $base:/home/textract/src $image - -rm ./Dockerfile - diff --git a/tests/xlsx/raw_text.txt b/tests/xlsx/raw_text.txt index 16925a26..42afafaf 100644 --- a/tests/xlsx/raw_text.txt +++ b/tests/xlsx/raw_text.txt @@ -1,17 +1,17 @@ Category Function Description New? Help Topic Help -Logical IFERROR Returns a different result if the first argument evaluates to an error 1 HA01231765 -Statistical AVERAGEIF Returns the average for the cells specified by a given criterion 1 HA10047433 -Statistical AVERAGEIFS Returns the average for the cells specified by multiple criteria 1 HA10047493 -Statistical COUNTIFS Counts the number of cells that meet multiple criteria 1 HA10047494 -Math & Trig SUMIFS Adds the cells specified by a multiple criteria 1 HA10047504 -Cube CUBEMEMBER Returns a member or tuple in a cube hierarchy 1 HA10083017 -Cube CUBEVALUE Returns an aggregated value from a cube 1 HA10083018 -Cube CUBESET Defines a calculated set of members or tuples by sending a set expression to the cube on the server 1 HA10083019 -Cube CUBERANKEDMEMBER Returns the nth, or ranked, member in a set 1 HA10083020 -Cube CUBEKPIMEMBER Returns a key performance indicator name, property, and measure, and displays the name and property in the cell. 
1 HA10083021 -Cube CUBEMEMBERPROPERTY Returns the value of a member property in the cube 1 HA10083023 -Cube CUBESETCOUNT Returns the number of items in a set 1 HA10083024 +Logical IFERROR Returns a different result if the first argument evaluates to an error True HA01231765 +Statistical AVERAGEIF Returns the average for the cells specified by a given criterion True HA10047433 +Statistical AVERAGEIFS Returns the average for the cells specified by multiple criteria True HA10047493 +Statistical COUNTIFS Counts the number of cells that meet multiple criteria True HA10047494 +Math & Trig SUMIFS Adds the cells specified by a multiple criteria True HA10047504 +Cube CUBEMEMBER Returns a member or tuple in a cube hierarchy True HA10083017 +Cube CUBEVALUE Returns an aggregated value from a cube True HA10083018 +Cube CUBESET Defines a calculated set of members or tuples by sending a set expression to the cube on the server True HA10083019 +Cube CUBERANKEDMEMBER Returns the nth, or ranked, member in a set True HA10083020 +Cube CUBEKPIMEMBER Returns a key performance indicator name, property, and measure, and displays the name and property in the cell. 
True HA10083021 +Cube CUBEMEMBERPROPERTY Returns the value of a member property in the cube True HA10083023 +Cube CUBESETCOUNT Returns the number of items in a set True HA10083024 Database DAVERAGE Averages the values in a column of a list or database that match conditions you specify HP10062266 Database DCOUNT Counts the cells that contain numbers in a column of a list or database that match conditions you specify HP10062267 Database DCOUNTA Counts the nonblank cells in a column of a list or database that match conditions you specify HP10062268 @@ -144,7 +144,7 @@ Information TYPE Returns a number indicating the data type of a value HP10062400 Logical Returns the logical value FALSE HP10062401 Logical NOT Reverses the logic of its argument HP10062402 Logical OR Returns TRUE if any argument is TRUE HP10062403 -Logical 1 Returns the logical value TRUE HP10062404 +Logical True Returns the logical value TRUE HP10062404 Lookup & Reference ADDRESS Returns a reference as text to a single cell in a worksheet HP10062407 Lookup & Reference AREAS Returns the number of areas in a reference HP10062408 Lookup & Reference COLUMN Returns the column number of a reference HP10062409 @@ -342,15 +342,15 @@ Math & Trig MMULT Returns the matrix product of two arrays HP10069842 New (All) Row Labels Count of Function -Statistical 83.0 -Math & Trig 60.0 -Financial 53.0 -Engineering 39.0 -Text 24.0 -Date & Time 20.0 -Lookup & Reference 18.0 -Information 17.0 -Database 12.0 -Cube 7.0 -Logical 7.0 -Grand Total 340.0 +Statistical 83 +Math & Trig 60 +Financial 53 +Engineering 39 +Text 24 +Date & Time 20 +Lookup & Reference 18 +Information 17 +Database 12 +Cube 7 +Logical 7 +Grand Total 340 diff --git a/textract/__init__.py b/textract/__init__.py index 046c067c..82252701 100644 --- a/textract/__init__.py +++ b/textract/__init__.py @@ -1,3 +1,3 @@ from .parsers import process -VERSION = "1.6.5" +VERSION = "1.7.0" diff --git a/textract/parsers/xls_parser.py b/textract/parsers/xls_parser.py index 
f305b60e..166c3539 100644
--- a/textract/parsers/xls_parser.py
+++ b/textract/parsers/xls_parser.py
@@ -1 +1,33 @@
-from .xlsx_parser import Parser
+import xlrd
+
+from .utils import BaseParser
+
+
+class Parser(BaseParser):
+    """Extract text from legacy Excel files (.xls) using xlrd.
+    """
+
+    def extract(self, filename, **kwargs):
+        """Return the text of every truthy cell in the workbook.
+
+        Sheets are read in workbook order; each row with at least one
+        truthy cell value becomes one line of space-separated values.
+        Falsy cell values are skipped, and the result always starts
+        with a newline.
+        """
+        workbook = xlrd.open_workbook(filename)
+        output = "\n"
+        for name in workbook.sheet_names():
+            worksheet = workbook.sheet_by_name(name)
+            for curr_row in range(worksheet.nrows):
+                new_output = []
+                for index_col in range(worksheet.ncols):
+                    value = worksheet.cell_value(curr_row, index_col)
+                    if value:
+                        # join() below needs text, so stringify numbers
+                        if isinstance(value, (int, float)):
+                            value = str(value)
+                        new_output.append(value)
+                if new_output:
+                    output += u' '.join(new_output) + u'\n'
+        return output
diff --git a/textract/parsers/xlsx_parser.py b/textract/parsers/xlsx_parser.py
index 166c3539..6e10d48d 100644
--- a/textract/parsers/xlsx_parser.py
+++ b/textract/parsers/xlsx_parser.py
@@ -1,7 +1,4 @@
-import xlrd
-import six
-
-from six.moves import xrange
+import openpyxl
 
 from .utils import BaseParser
 
@@ -11,23 +8,31 @@ class Parser(BaseParser):
     """
 
     def extract(self, filename, **kwargs):
-        workbook = xlrd.open_workbook(filename)
-        sheets_name = workbook.sheet_names()
+        """Return the text of every truthy cell in the workbook.
+
+        Sheets are read in workbook order; each row with at least one
+        truthy cell value becomes one line of space-separated values.
+        Falsy cell values are skipped, and the result always starts
+        with a newline.
+        """
+        workbook = openpyxl.load_workbook(filename=filename, read_only=True, data_only=True)
         output = "\n"
-        for names in sheets_name:
-            worksheet = workbook.sheet_by_name(names)
-            num_rows = worksheet.nrows
-            num_cells = worksheet.ncols
-
-            for curr_row in range(num_rows):
-                row = worksheet.row(curr_row)
-                new_output = []
-                for index_col in xrange(num_cells):
-                    value = worksheet.cell_value(curr_row, index_col)
-                    if value:
-                        if isinstance(value, (int, float)):
-                            value = six.text_type(value)
-                        new_output.append(value)
-                if new_output:
-                    output += u' '.join(new_output) + u'\n'
+        try:
+            for names in workbook.sheetnames:
+                worksheet = workbook[names]
+                for row in worksheet.iter_rows():
+                    new_output = []
+                    for cell in row:
+                        value = cell.value
+                        if value:
+                            # cell values are not always text (numbers,
+                            # booleans, dates, ...) but join() needs str
+                            if not isinstance(value, str):
+                                value = str(value)
+                            new_output.append(value)
+                    if new_output:
+                        output += u' '.join(new_output) + u'\n'
+        finally:
+            # a read_only workbook keeps the file open until close()
+            workbook.close()
         return output