diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 5137acbb0..cdc675e2b 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.0.2 +current_version = 4.0.3-rc4 tag_name = {new_version} commit = True tag = True diff --git a/Dockerfile b/Dockerfile index e0bc8b66e..20d2c6fb0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,7 +22,7 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source # image processing, djvu mdbtools djvulibre-bin \ libtiff5-dev \ - libtiff-tools ghostscript librsvg2-bin jbig2dec \ + libtiff-tools ghostscript librsvg2-bin jbig2dec libopenjp2-7-dev \ pst-utils libgif-dev \ ### tesseract tesseract-ocr-eng \ @@ -121,7 +121,7 @@ RUN mkdir /models/ && \ curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz" COPY requirements.txt /tmp/ -RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt +RUN pip3 install --no-cache-dir --no-binary "tesserocr" --no-binary "Pillow" -r /tmp/requirements.txt # Install spaCy models RUN python3 -m spacy download en_core_web_sm \ @@ -147,11 +147,10 @@ RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep5 RUN chown -R app:app /ingestors ENV ARCHIVE_TYPE=file \ - ARCHIVE_PATH=/data \ - FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \ - REDIS_URL=redis://redis:6379/0 \ - TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata \ - LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1 + ARCHIVE_PATH=/data \ + FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \ + REDIS_URL=redis://redis:6379/0 \ + TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata # USER app CMD ingestors process diff --git a/Makefile b/Makefile index c6506a62c..b259eabe2 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ INGEST=ghcr.io/alephdata/ingest-file COMPOSE=docker compose -DOCKER=$(COMPOSE) run --rm ingest-file +DOCKER=$(COMPOSE) run --rm -e LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1 ingest-file .PHONY: build diff --git a/ingestors/__init__.py b/ingestors/__init__.py index 48b76feca..2c9d440c5 100644 --- a/ingestors/__init__.py +++ b/ingestors/__init__.py @@ -2,7 +2,7 @@ import logging -__version__ = "4.0.2" +__version__ = "4.0.3-rc4" logging.getLogger("chardet").setLevel(logging.INFO) logging.getLogger("PIL").setLevel(logging.INFO) diff --git a/ingestors/media/image.py b/ingestors/media/image.py index af358397e..43f8096f2 100644 --- a/ingestors/media/image.py +++ b/ingestors/media/image.py @@ -1,6 +1,6 @@ import logging from io import BytesIO -from PIL import Image, ExifTags +from PIL import Image, ExifTags, ImageFile from followthemoney import model from ingestors.ingestor import Ingestor @@ -10,6 +10,9 @@ log = logging.getLogger(__name__) +# from https://stackoverflow.com/a/47958486 +ImageFile.LOAD_TRUNCATED_IMAGES = True + class ImageIngestor(Ingestor, OCRSupport, TimestampSupport): """Image file ingestor class. Extracts the text from images using OCR.""" diff --git a/ingestors/support/ocr.py b/ingestors/support/ocr.py index f40ce7a2c..27260eda6 100644 --- a/ingestors/support/ocr.py +++ b/ingestors/support/ocr.py @@ -3,7 +3,7 @@ import threading from hashlib import sha1 from normality import stringify -from PIL import Image +from PIL import Image, ImageFile from io import BytesIO from languagecodes import list_to_alpha3 as alpha3 @@ -13,6 +13,7 @@ log = logging.getLogger(__name__) TESSERACT_LOCALE = "C" +ImageFile.LOAD_TRUNCATED_IMAGES = True class OCRSupport(CacheSupport): diff --git a/ingestors/support/shell.py b/ingestors/support/shell.py index 661cfe023..b34b5404a 100644 --- a/ingestors/support/shell.py +++ b/ingestors/support/shell.py @@ -1,7 +1,7 @@ import os +import shutil import subprocess from servicelayer import env -from distutils.spawn import find_executable from ingestors.util import path_string from ingestors.exc import ProcessingException @@ -17,7 +17,7 @@ class ShellSupport(object): def find_command(self, name): config_name = "%s_BIN" % name config_name = config_name.replace("-", "_").upper() - return env.get(config_name, find_executable(name)) + return env.get(config_name, shutil.which(name)) def exec_command(self, command, *args): binary = self.find_command(command) diff --git a/requirements.txt b/requirements.txt index d36781893..7ba7eadaf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ normality==2.5.0 pantomime==0.6.1 followthemoney==3.5.9 followthemoney-store[postgresql]==3.1.0 -servicelayer[google,amazon]==1.23.2 +servicelayer[google,amazon]==1.23.3-rc7 languagecodes==1.1.1 countrytagger==0.1.2 pyicu==2.12 @@ -31,7 +31,7 @@ odfpy==1.4.1 cchardet==2.1.7 lxml==5.0.0 olefile==0.47 -Pillow==10.1.0 +Pillow==10.4.0 vobject==0.9.6.1 msglite==0.30.0 icalendar==5.0.12 @@ -41,4 +41,4 @@ requests[security]==2.31.0 pymupdf==1.21.1 prometheus-client==0.17.1 -sentry_sdk==2.0.1 +sentry_sdk==2.19.2 diff --git a/setup.py b/setup.py index 16f56ca85..1e82aee0f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="ingest", - version="4.0.2", + version="4.0.3-rc4", author="Organized Crime and Corruption Reporting Project", packages=find_packages(exclude=["tests"]), package_dir={"ingestors": "ingestors"},