Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Release 4.0.3 #687

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
-current_version = 4.0.2
+current_version = 4.0.3-rc4
tag_name = {new_version}
commit = True
tag = True
Expand Down
13 changes: 6 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ RUN echo "deb http://http.us.debian.org/debian stable non-free" >/etc/apt/source
# image processing, djvu
mdbtools djvulibre-bin \
libtiff5-dev \
-libtiff-tools ghostscript librsvg2-bin jbig2dec \
+libtiff-tools ghostscript librsvg2-bin jbig2dec libopenjp2-7-dev \
pst-utils libgif-dev \
### tesseract
tesseract-ocr-eng \
Expand Down Expand Up @@ -121,7 +121,7 @@ RUN mkdir /models/ && \
curl -o "/models/model_type_prediction.ftz" "https://public.data.occrp.org/develop/models/types/type-08012020-7a69d1b.ftz"

COPY requirements.txt /tmp/
-RUN pip3 install --no-cache-dir --no-binary "tesserocr" -r /tmp/requirements.txt
+RUN pip3 install --no-cache-dir --no-binary "tesserocr" --no-binary "Pillow" -r /tmp/requirements.txt

# Install spaCy models
RUN python3 -m spacy download en_core_web_sm \
Expand All @@ -147,11 +147,10 @@ RUN pip install --no-cache-dir --config-settings editable_mode=compat --use-pep5
RUN chown -R app:app /ingestors

ENV ARCHIVE_TYPE=file \
-ARCHIVE_PATH=/data \
-FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
-REDIS_URL=redis://redis:6379/0 \
-TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata \
-LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1
+ARCHIVE_PATH=/data \
+FTM_STORE_URI=postgresql://aleph:aleph@postgres/aleph \
+REDIS_URL=redis://redis:6379/0 \
+TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata

# USER app
CMD ingestors process
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
INGEST=ghcr.io/alephdata/ingest-file
COMPOSE=docker compose
-DOCKER=$(COMPOSE) run --rm ingest-file
+DOCKER=$(COMPOSE) run --rm -e LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1 ingest-file

.PHONY: build

Expand Down
2 changes: 1 addition & 1 deletion ingestors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging

-__version__ = "4.0.2"
+__version__ = "4.0.3-rc4"

logging.getLogger("chardet").setLevel(logging.INFO)
logging.getLogger("PIL").setLevel(logging.INFO)
Expand Down
5 changes: 4 additions & 1 deletion ingestors/media/image.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import logging
from io import BytesIO
-from PIL import Image, ExifTags
+from PIL import Image, ExifTags, ImageFile
from followthemoney import model

from ingestors.ingestor import Ingestor
Expand All @@ -10,6 +10,9 @@

log = logging.getLogger(__name__)

# Let Pillow load truncated image files instead of raising an error
# (see https://stackoverflow.com/a/47958486)
ImageFile.LOAD_TRUNCATED_IMAGES = True


class ImageIngestor(Ingestor, OCRSupport, TimestampSupport):
"""Image file ingestor class. Extracts the text from images using OCR."""
Expand Down
3 changes: 2 additions & 1 deletion ingestors/support/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import threading
from hashlib import sha1
from normality import stringify
-from PIL import Image
+from PIL import Image, ImageFile
from io import BytesIO
from languagecodes import list_to_alpha3 as alpha3

Expand All @@ -13,6 +13,7 @@

log = logging.getLogger(__name__)
TESSERACT_LOCALE = "C"
ImageFile.LOAD_TRUNCATED_IMAGES = True


class OCRSupport(CacheSupport):
Expand Down
4 changes: 2 additions & 2 deletions ingestors/support/shell.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
+import shutil
import subprocess
from servicelayer import env
-from distutils.spawn import find_executable

from ingestors.util import path_string
from ingestors.exc import ProcessingException
Expand All @@ -17,7 +17,7 @@ class ShellSupport(object):
def find_command(self, name):
config_name = "%s_BIN" % name
config_name = config_name.replace("-", "_").upper()
-return env.get(config_name, find_executable(name))
+return env.get(config_name, shutil.which(name))

def exec_command(self, command, *args):
binary = self.find_command(command)
Expand Down
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ normality==2.5.0
pantomime==0.6.1
followthemoney==3.5.9
followthemoney-store[postgresql]==3.1.0
-servicelayer[google,amazon]==1.23.2
+servicelayer[google,amazon]==1.23.3-rc7
languagecodes==1.1.1
countrytagger==0.1.2
pyicu==2.12
Expand Down Expand Up @@ -31,7 +31,7 @@ odfpy==1.4.1
cchardet==2.1.7
lxml==5.0.0
olefile==0.47
-Pillow==10.1.0
+Pillow==10.4.0
vobject==0.9.6.1
msglite==0.30.0
icalendar==5.0.12
Expand All @@ -41,4 +41,4 @@ requests[security]==2.31.0
pymupdf==1.21.1

prometheus-client==0.17.1
-sentry_sdk==2.0.1
+sentry_sdk==2.19.2
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="ingest",
-version="4.0.2",
+version="4.0.3-rc4",
author="Organized Crime and Corruption Reporting Project",
packages=find_packages(exclude=["tests"]),
package_dir={"ingestors": "ingestors"},
Expand Down
Loading