forked from nlmatics/nlm-ingestor
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Dockerfile-lambda
48 lines (43 loc) · 1.91 KB
/
Dockerfile-lambda
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# syntax=docker/dockerfile:1
# The stable frontend channel replaces the obsolete "experimental" tag.
# The syntax directive must remain the very first line of the file to be honoured.

# Base image: full Debian bookworm image with Python 3.11.
FROM python:3.11-bookworm

# libgomp1 is the GNU OpenMP runtime library.
# The package index is refreshed and removed inside the same layer so that no
# stale apt lists are baked into the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends libgomp1 \
    && rm -rf /var/lib/apt/lists/*

# Application root directory inside the image (key=value form; the
# space-separated ENV form is deprecated).
ENV APP_HOME=/app
# install Java
# Headless OpenJDK 17 runtime, installed without recommended extras.
# NOTE(review): the mkdir of /usr/share/man/man1 looks like the usual
# workaround for the JRE package failing to register man pages on images that
# lack that directory - confirm whether it is still needed on this base image.
# The apt package index is intentionally left in place: later layers also run
# apt-get install.
RUN mkdir -p /usr/share/man/man1 \
    && apt-get update \
    && apt-get install -y --no-install-recommends openjdk-17-jre-headless
# install essential packages
# Compiler toolchain plus development headers for libxml2, libxslt and libmagic.
# NOTE(review): presumably needed to build native Python wheels from
# requirements.txt - confirm against that file.
# apt-get update runs in this same layer so the install never depends on a
# package index cached by an earlier (possibly stale) layer.
# The index is intentionally left in place: later layers also run apt-get install.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        build-essential \
        libmagic-dev \
        libxml2-dev \
        libxslt-dev
# install tesseract
# Steps, all in one layer:
#   1. refresh the index and install the distro tesseract-ocr plus lsb-release
#      (lsb_release supplies the Debian codename used in the repository URL);
#   2. register the notesalexp.org Tesseract 5 repository for that codename;
#   3. bootstrap trust by installing the repository's keyring package - this
#      one update/install pair necessarily runs unauthenticated;
#   4. refresh the index again and install tesseract-ocr and libtesseract-dev
#      with the third-party repository now available;
#   5. download the English trained-data file into the Tesseract 5 tessdata dir.
# The sources entry is written with a plain redirect instead of `| tee`, so a
# failing command is not hidden behind a pipeline.
# NOTE(review): step 3 trusts whatever the repository serves at build time, and
# eng.traineddata is fetched from the moving `main` branch without a checksum;
# consider pinning a release tag and verifying a sha256.
# The apt package index is intentionally left in place: later layers also run
# apt-get install.
RUN apt-get update \
    && apt-get install -y \
        lsb-release \
        tesseract-ocr \
    && echo "deb https://notesalexp.org/tesseract-ocr5/$(lsb_release -cs)/ $(lsb_release -cs) main" \
        > /etc/apt/sources.list.d/notesalexp.list \
    && apt-get update -oAcquire::AllowInsecureRepositories=true \
    && apt-get install -y --allow-unauthenticated -oAcquire::AllowInsecureRepositories=true \
        notesalexp-keyring \
    && apt-get update \
    && apt-get install -y \
        libtesseract-dev \
        tesseract-ocr \
    && wget -P /usr/share/tesseract-ocr/5/tessdata/ https://github.com/tesseract-ocr/tessdata/raw/main/eng.traineddata
# Install unzip and git with a single apt-get invocation, refreshing the
# package index in the same layer so the install does not rely on a cached one.
# autoremove then drops packages that were pulled in automatically and are no
# longer required.
# The index is intentionally left in place: a later layer also runs apt-get install.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        git \
        unzip \
    && apt-get autoremove -y
# Relative paths in the following instructions resolve against the app root.
WORKDIR ${APP_HOME}

# Copy only the dependency manifest at this point, so dependency layers are
# rebuilt only when requirements.txt itself changes.
COPY requirements.txt ./

# Upgrade the packaging tools. --no-cache-dir keeps pip's download cache out of
# the image layer.
RUN pip install --no-cache-dir --upgrade pip setuptools
# libmagic1 is the shared library for libmagic file-type detection.
# NOTE(review): presumably required at runtime by a Python binding listed in
# requirements.txt - confirm.
# The index is refreshed in this layer rather than relying on a cached one, and
# removed afterwards because no later instruction in this file uses apt.
RUN apt-get update \
    && apt-get install -y --no-install-recommends libmagic1 \
    && rm -rf /var/lib/apt/lists/*
# Record github.com's SSH host keys in known_hosts.
# The directory is created with mode 0700: a directory needs the search (x) bit
# to be entered, so the previous 0600 was only usable by root via its
# permission override.
# NOTE(review): presumably needed for git+ssh dependencies in requirements.txt
# - confirm. ssh-keyscan trusts whichever keys are served at build time;
# consider pinning GitHub's published host keys instead.
RUN mkdir -p -m 0700 ~/.ssh && ssh-keyscan github.com >> ~/.ssh/known_hosts
# Install the Python dependencies from the manifest copied into the working
# directory. --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt
# Set the NLTK data directory environment variable so NLTK uses a known
# location, both during the build and at runtime.
ENV NLTK_DATA=/usr/local/share/nltk_data

# Create the data directory (readable, writable and searchable by every user)
# and download the stopwords and punkt packages into it. Both package ids are
# passed to a single downloader call, so the data lands in one layer.
RUN mkdir -p "$NLTK_DATA" \
    && chmod a+rwx "$NLTK_DATA" \
    && python -m nltk.downloader -d "$NLTK_DATA" stopwords punkt
# Install awslambdaric, the module the image's ENTRYPOINT runs.
# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): the version is unpinned; consider pinning it for reproducible builds.
RUN pip install --no-cache-dir awslambdaric
# Bring the rest of the build context into the working directory.
COPY . .

# Start the container by running the awslambdaric module with the image's
# Python interpreter.
ENTRYPOINT ["/usr/local/bin/python", "-m", "awslambdaric"]

# Default argument handed to the entrypoint: the Lambda handler reference.
CMD ["handler.parse"]