From d06325ef38fc49e99087e9f942b37c4f7dc9c021 Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Fri, 13 Dec 2024 14:20:22 +0100 Subject: [PATCH 01/13] Use servicelayer 1.23.3-rc1 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 23945fd35..761928f3b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ normality==2.5.0 pantomime==0.6.1 followthemoney==3.5.9 followthemoney-store[postgresql]==3.1.0 -servicelayer[google,amazon]==1.23.2 +servicelayer[google,amazon]==1.23.3-rc1 languagecodes==1.1.1 countrytagger==0.1.2 pyicu==2.12 From 5a017d75a71c864477f2d73fc55f3c66f123ae76 Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Fri, 13 Dec 2024 15:04:07 +0100 Subject: [PATCH 02/13] =?UTF-8?q?Bump=20version:=204.0.2=20=E2=86=92=204.0?= =?UTF-8?q?.3-rc1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- ingestors/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 5137acbb0..417542200 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.0.2 +current_version = 4.0.3-rc1 tag_name = {new_version} commit = True tag = True diff --git a/ingestors/__init__.py b/ingestors/__init__.py index 48b76feca..b556d4015 100644 --- a/ingestors/__init__.py +++ b/ingestors/__init__.py @@ -2,7 +2,7 @@ import logging -__version__ = "4.0.2" +__version__ = "4.0.3-rc1" logging.getLogger("chardet").setLevel(logging.INFO) logging.getLogger("PIL").setLevel(logging.INFO) diff --git a/setup.py b/setup.py index 16f56ca85..84971e03a 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="ingest", - version="4.0.2", + version="4.0.3-rc1", author="Organized Crime and Corruption Reporting Project", packages=find_packages(exclude=["tests"]), package_dir={"ingestors": "ingestors"}, From 8bee1688f625592b74b77dc4595f8ab0bee53e00 Mon Sep 17 00:00:00 2001 From: Alex Stefanescu Date: Thu, 19 Dec 2024 14:55:46 +0100 Subject: [PATCH 03/13] Bump servicelayer to 1.23.3-rc5 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 761928f3b..aca9be3a1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ normality==2.5.0 pantomime==0.6.1 followthemoney==3.5.9 followthemoney-store[postgresql]==3.1.0 -servicelayer[google,amazon]==1.23.3-rc1 +servicelayer[google,amazon]==1.23.3-rc5 languagecodes==1.1.1 countrytagger==0.1.2 pyicu==2.12 From 511586a0859be3d97a327669207de1de51bcfad7 Mon Sep 17 00:00:00 2001 From: Alex Stefanescu Date: Thu, 19 Dec 2024 15:06:21 +0100 Subject: [PATCH 04/13] =?UTF-8?q?Bump=20version:=204.0.3-rc1=20=E2=86=92?= =?UTF-8?q?=204.0.3-rc2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- ingestors/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 417542200..41ce35bc7 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.0.3-rc1 +current_version = 4.0.3-rc2 tag_name = {new_version} commit = True tag = True diff --git a/ingestors/__init__.py b/ingestors/__init__.py index b556d4015..401abcb0b 100644 --- a/ingestors/__init__.py +++ b/ingestors/__init__.py @@ -2,7 +2,7 @@ import logging -__version__ = "4.0.3-rc1" +__version__ = "4.0.3-rc2" logging.getLogger("chardet").setLevel(logging.INFO) logging.getLogger("PIL").setLevel(logging.INFO) diff --git a/setup.py b/setup.py index 84971e03a..346134ae3 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="ingest", - version="4.0.3-rc1", + version="4.0.3-rc2", author="Organized Crime and Corruption Reporting Project", packages=find_packages(exclude=["tests"]), package_dir={"ingestors": "ingestors"}, From 59ef249dc376edaaa1acaac92664e5506c9d37a4 Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Mon, 6 Jan 2025 16:40:22 +0100 Subject: [PATCH 05/13] servicelayer 1.23.3-rc6 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index aca9be3a1..a26ca273c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ normality==2.5.0 pantomime==0.6.1 followthemoney==3.5.9 followthemoney-store[postgresql]==3.1.0 -servicelayer[google,amazon]==1.23.3-rc5 +servicelayer[google,amazon]==1.23.3-rc6 languagecodes==1.1.1 countrytagger==0.1.2 pyicu==2.12 From f7041bbc24dd57558d07c5a52c0383dc0b9bd6e0 Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Mon, 6 Jan 2025 16:44:12 +0100 Subject: [PATCH 06/13] =?UTF-8?q?Bump=20version:=204.0.3-rc2=20=E2=86=92?= =?UTF-8?q?=204.0.3-rc3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- ingestors/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 41ce35bc7..ae425943d 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.0.3-rc2 +current_version = 4.0.3-rc3 tag_name = {new_version} commit = True tag = True diff --git a/ingestors/__init__.py b/ingestors/__init__.py index 401abcb0b..be3d14d06 100644 --- a/ingestors/__init__.py +++ b/ingestors/__init__.py @@ -2,7 +2,7 @@ import logging -__version__ = "4.0.3-rc2" +__version__ = "4.0.3-rc3" logging.getLogger("chardet").setLevel(logging.INFO) logging.getLogger("PIL").setLevel(logging.INFO) diff --git a/setup.py b/setup.py index 346134ae3..dcf4d55a1 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="ingest", - version="4.0.3-rc2", + version="4.0.3-rc3", author="Organized Crime and Corruption Reporting Project", packages=find_packages(exclude=["tests"]), package_dir={"ingestors": "ingestors"}, From cfc3200ca56896f45d651d5e64b2a986b9965819 Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Tue, 7 Jan 2025 15:25:00 +0100 Subject: [PATCH 07/13] servicelayer 1.23.3-rc7 and sentry-sdk 2.19.2 --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index a26ca273c..276b8500f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ normality==2.5.0 pantomime==0.6.1 followthemoney==3.5.9 followthemoney-store[postgresql]==3.1.0 -servicelayer[google,amazon]==1.23.3-rc6 +servicelayer[google,amazon]==1.23.3-rc7 languagecodes==1.1.1 countrytagger==0.1.2 pyicu==2.12 @@ -40,4 +40,4 @@ requests[security]==2.31.0 pymupdf==1.21.1 prometheus-client==0.17.1 -sentry_sdk==2.0.1 +sentry_sdk==2.19.2 From 211f61aac0a067a24c470b1a787aa3ff9a42032f Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Tue, 7 Jan 2025 15:25:05 +0100 Subject: [PATCH 08/13] =?UTF-8?q?Bump=20version:=204.0.3-rc3=20=E2=86=92?= =?UTF-8?q?=204.0.3-rc4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- ingestors/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index ae425943d..cdc675e2b 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.0.3-rc3 +current_version = 4.0.3-rc4 tag_name = {new_version} commit = True tag = True diff --git a/ingestors/__init__.py b/ingestors/__init__.py index be3d14d06..2c9d440c5 100644 --- a/ingestors/__init__.py +++ b/ingestors/__init__.py @@ -2,7 +2,7 @@ import logging -__version__ = "4.0.3-rc3" +__version__ = "4.0.3-rc4" logging.getLogger("chardet").setLevel(logging.INFO) logging.getLogger("PIL").setLevel(logging.INFO) diff --git a/setup.py b/setup.py index dcf4d55a1..1e82aee0f 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name="ingest", - version="4.0.3-rc3", + version="4.0.3-rc4", author="Organized Crime and Corruption Reporting Project", packages=find_packages(exclude=["tests"]), package_dir={"ingestors": "ingestors"}, From 6f53c107d0b2b2e67e20781052e040063fc2b387 Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Wed, 8 Jan 2025 15:52:09 +0100 Subject: [PATCH 09/13] chore: update pillow to 10.3.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 276b8500f..b94794a3d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,7 +30,7 @@ odfpy==1.4.1 cchardet==2.1.7 lxml==5.0.0 olefile==0.47 -Pillow==10.1.0 +Pillow==10.3.0 vobject==0.9.6.1 msglite==0.30.0 icalendar==5.0.12 From eb0a6d0cc625ae2df1041b0a6f1f557536c59c94 Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Fri, 10 Jan 2025 15:26:03 +0100 Subject: [PATCH 10/13] move mac OS related LD_PRELOAD to Makefile --- Dockerfile | 3 +-- Makefile | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index ed3c377fe..e22854379 100644 --- a/Dockerfile +++ b/Dockerfile @@ -155,8 +155,7 @@ ENV ARCHIVE_TYPE=file \ ARCHIVE_PATH=/data \ FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \ REDIS_URL=redis://redis:6379/0 \ - TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata \ - LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1 + TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata # USER app CMD ingestors process diff --git a/Makefile b/Makefile index 53eb12e80..d97f2af2d 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ INGEST=ghcr.io/alephdata/ingest-file COMPOSE=docker compose -DOCKER=$(COMPOSE) run --rm ingest-file +DOCKER=$(COMPOSE) run --rm -e LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1 ingest-file .PHONY: build From d3437394068769b798a1a1b6eec761df39057ab0 Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Tue, 14 Jan 2025 15:04:41 +0100 Subject: [PATCH 11/13] Fix deprecated `which` lookup --- ingestors/support/shell.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ingestors/support/shell.py b/ingestors/support/shell.py index 661cfe023..b34b5404a 100644 --- a/ingestors/support/shell.py +++ b/ingestors/support/shell.py @@ -1,7 +1,7 @@ import os +import shutil import subprocess from servicelayer import env -from distutils.spawn import find_executable from ingestors.util import path_string from ingestors.exc import ProcessingException @@ -17,7 +17,7 @@ class ShellSupport(object): def find_command(self, name): config_name = "%s_BIN" % name config_name = config_name.replace("-", "_").upper() - return env.get(config_name, find_executable(name)) + return env.get(config_name, shutil.which(name)) def exec_command(self, command, *args): binary = self.find_command(command) From 05ffe34e4ee3e49e4e954dbe29ed69d8b885b54d Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Tue, 14 Jan 2025 15:05:50 +0100 Subject: [PATCH 12/13] bugfix: allow loading truncated images, enable Pillow 10.4.0 --- Dockerfile | 4 ++-- ingestors/media/image.py | 5 ++++- ingestors/support/ocr.py | 3 ++- requirements.txt | 2 +- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index e22854379..1c2bb1752 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,7 @@ RUN apt-get -qq -y update \ # image processing, djvu imagemagick-common imagemagick mdbtools djvulibre-bin \ libtiff5-dev libjpeg-dev libfreetype6-dev libwebp-dev \ - libtiff-tools ghostscript librsvg2-bin jbig2dec \ + libtiff-tools ghostscript librsvg2-bin jbig2dec libopenjp2-7-dev \ pst-utils \ ### tesseract tesseract-ocr-eng \ @@ -126,7 +126,7 @@ RUN mkdir /models/ && \ COPY requirements.txt /tmp/ RUN pip3 install --no-cache-dir --prefer-binary --upgrade pip RUN pip3 install --no-cache-dir --prefer-binary --upgrade setuptools wheel -RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt +RUN pip3 install --no-cache-dir --no-binary "tesserocr" --no-binary "Pillow" -r /tmp/requirements.txt # Install spaCy models RUN python3 -m spacy download en_core_web_sm \ diff --git a/ingestors/media/image.py b/ingestors/media/image.py index af358397e..43f8096f2 100644 --- a/ingestors/media/image.py +++ b/ingestors/media/image.py @@ -1,6 +1,6 @@ import logging from io import BytesIO -from PIL import Image, ExifTags +from PIL import Image, ExifTags, ImageFile from followthemoney import model from ingestors.ingestor import Ingestor @@ -10,6 +10,9 @@ log = logging.getLogger(__name__) +# from https://stackoverflow.com/a/47958486 +ImageFile.LOAD_TRUNCATED_IMAGES = True + class ImageIngestor(Ingestor, OCRSupport, TimestampSupport): """Image file ingestor class. Extracts the text from images using OCR.""" diff --git a/ingestors/support/ocr.py b/ingestors/support/ocr.py index f40ce7a2c..27260eda6 100644 --- a/ingestors/support/ocr.py +++ b/ingestors/support/ocr.py @@ -3,7 +3,7 @@ import threading from hashlib import sha1 from normality import stringify -from PIL import Image +from PIL import Image, ImageFile from io import BytesIO from languagecodes import list_to_alpha3 as alpha3 @@ -13,6 +13,7 @@ log = logging.getLogger(__name__) TESSERACT_LOCALE = "C" +ImageFile.LOAD_TRUNCATED_IMAGES = True class OCRSupport(CacheSupport): diff --git a/requirements.txt b/requirements.txt index b94794a3d..5956cc837 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,7 +30,7 @@ odfpy==1.4.1 cchardet==2.1.7 lxml==5.0.0 olefile==0.47 -Pillow==10.3.0 +Pillow==10.4.0 vobject==0.9.6.1 msglite==0.30.0 icalendar==5.0.12 From 056605bb8e212ac61b9d89578d3a261150dfbc4c Mon Sep 17 00:00:00 2001 From: Christian Stefanescu Date: Tue, 14 Jan 2025 16:26:35 +0100 Subject: [PATCH 13/13] bring in OS upgrade changes --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index bb0559a12..20d2c6fb0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -150,7 +150,7 @@ ENV ARCHIVE_TYPE=file \ ARCHIVE_PATH=/data \ FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \ REDIS_URL=redis://redis:6379/0 \ - TESSDATA_PREFIX=/usr/share/tesseract-ocr/4.00/tessdata + TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata # USER app CMD ingestors process