-
Notifications
You must be signed in to change notification settings - Fork 462
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix outdated Dockerfile and Flask app (#251)
* Add requirements needed for laser_encoders * Add script to use laser_encoder * Update flask app to use laser_encoders * Update Dockerfile to build image for laser_encoder * Update README to setup docker * Update README * style: Format code and sort imports using black and isort * Update Dockerfile to include maintainer * Update README for docker setup * Remove unnecessary file in docker directory * Enable pip installing laser_encoders from local directory * Fix pip install error while building docker container * Add error handling for unsupported languages in /vectorize endpoint * Add language model download to Docker build process * Create cache for encoder to improve subsequent request speed * Add build arguments to predownload encoders and tokenizers * Update README on usage * Update README * Change default lang to 2 letter code * Update README to indicate language used in default build * Update Dockerfile to use toml file instead of requirements file * Improve caching for laser2 languages * Fix faulty caching logic
- Loading branch information
1 parent
995c2f7
commit dc1c68e
Showing
4 changed files
with
131 additions
and
124 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,65 +2,37 @@ FROM continuumio/miniconda3 | |
|
||
MAINTAINER Gilles Bodart <[email protected]> | ||
|
||
RUN conda create -n env python=3.6 | ||
# Install build-essential (compiler and development tools) | ||
RUN apt-get update && \ | ||
apt-get install -y build-essential && \ | ||
rm -rf /var/lib/apt/lists/* | ||
|
||
RUN conda create -n env python=3.8 | ||
RUN echo "source activate env" > ~/.bashrc | ||
ENV PATH /opt/conda/envs/env/bin:$PATH | ||
|
||
RUN apt-get -qq -y update | ||
RUN apt-get -qq -y upgrade | ||
RUN apt-get -qq -y install \ | ||
gcc \ | ||
g++ \ | ||
wget \ | ||
curl \ | ||
git \ | ||
make \ | ||
unzip \ | ||
sudo \ | ||
vim | ||
|
||
# Use C.UTF-8 locale to avoid issues with ASCII encoding | ||
ENV LC_ALL=C.UTF-8 | ||
ENV LANG=C.UTF-8 | ||
|
||
# Set the working directory to /app | ||
WORKDIR /app | ||
|
||
COPY ./requirements.txt /app/requirements.txt | ||
|
||
# Install any needed packages specified in requirements.txt | ||
RUN pip install --trusted-host pypi.python.org -r requirements.txt --verbose | ||
|
||
|
||
# Download LASER from FB | ||
RUN git clone https://github.com/facebookresearch/LASER.git | ||
|
||
ENV LASER /app/LASER | ||
WORKDIR $LASER | ||
|
||
RUN bash ./install_models.sh | ||
# Copy the local laser-encoders repository | ||
COPY laser_encoders /app/laser_encoders | ||
COPY pyproject.toml /app/pyproject.toml | ||
|
||
RUN pip install --upgrade pip | ||
RUN pip install -e . | ||
RUN pip install Flask==2.3.3 Requests==2.31.0 | ||
|
||
#Installing FAISS | ||
|
||
RUN conda install --name env -c pytorch faiss-cpu -y | ||
|
||
RUN bash ./install_external_tools.sh | ||
|
||
COPY ./decode.py $LASER/tasks/embed/decode.py | ||
|
||
|
||
# Make port 80 available to the world outside this container | ||
WORKDIR /app | ||
|
||
RUN echo "Hello World" > test.txt | ||
# Define the argument for language | ||
ARG langs="eng_Latn" | ||
|
||
RUN $LASER/tasks/embed/embed.sh test.txt en test_embed.raw | ||
RUN python $LASER/tasks/embed/decode.py test_embed.raw | ||
# Download language models for each specified language | ||
RUN for lang in $langs; do \ | ||
python -m laser_encoders.download_models --lang=$lang; \ | ||
done | ||
|
||
#Open the port 80 | ||
# Open the port 80 | ||
EXPOSE 80 | ||
|
||
COPY ./app.py /app/app.py | ||
COPY docker/app.py /app/app.py | ||
|
||
CMD ["/bin/bash"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,78 +1,64 @@ | ||
#!/usr/bin/env python | ||
# -*- coding: utf-8 -*- | ||
from flask import Flask, request, jsonify | ||
import os | ||
import socket | ||
import tempfile | ||
from pathlib import Path | ||
import numpy as np | ||
from LASER.source.lib.text_processing import Token, BPEfastApply | ||
from LASER.source.embed import * | ||
|
||
from flask import Flask, jsonify, request | ||
|
||
from laser_encoders import LaserEncoderPipeline | ||
from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE | ||
|
||
app = Flask(__name__) | ||
app.config['JSON_AS_ASCII'] = False | ||
|
||
# Global cache for encoders | ||
encoder_cache = {} | ||
|
||
laser2_encoder = None | ||
|
||
|
||
@app.route("/") | ||
def root(): | ||
print("/") | ||
html = "<h3>Hello {name}!</h3>" \ | ||
"<b>Hostname:</b> {hostname}<br/>" | ||
html = "<h3>Hello {name}!</h3>" "<b>Hostname:</b> {hostname}<br/>" | ||
return html.format(name=os.getenv("LASER", "world"), hostname=socket.gethostname()) | ||
|
||
|
||
@app.route("/vectorize") | ||
@app.route("/vectorize", methods=["GET"]) | ||
def vectorize(): | ||
content = request.args.get('q') | ||
lang = request.args.get('lang') | ||
embedding = '' | ||
if lang is None or not lang: | ||
lang = "en" | ||
# encoder | ||
model_dir = Path(__file__).parent / "LASER" / "models" | ||
encoder_path = model_dir / "bilstm.93langs.2018-12-26.pt" | ||
bpe_codes_path = model_dir / "93langs.fcodes" | ||
print(f' - Encoder: loading {encoder_path}') | ||
encoder = SentenceEncoder(encoder_path, | ||
max_sentences=None, | ||
max_tokens=12000, | ||
sort_kind='mergesort', | ||
cpu=True) | ||
with tempfile.TemporaryDirectory() as tmp: | ||
tmpdir = Path(tmp) | ||
ifname = tmpdir / "content.txt" | ||
bpe_fname = tmpdir / 'bpe' | ||
bpe_oname = tmpdir / 'out.raw' | ||
with ifname.open("w") as f: | ||
f.write(content) | ||
if lang != '--': | ||
tok_fname = tmpdir / "tok" | ||
Token(str(ifname), | ||
str(tok_fname), | ||
lang=lang, | ||
romanize=True if lang == 'el' else False, | ||
lower_case=True, | ||
gzip=False, | ||
verbose=True, | ||
over_write=False) | ||
ifname = tok_fname | ||
BPEfastApply(str(ifname), | ||
str(bpe_fname), | ||
str(bpe_codes_path), | ||
verbose=True, over_write=False) | ||
ifname = bpe_fname | ||
EncodeFile(encoder, | ||
str(ifname), | ||
str(bpe_oname), | ||
verbose=True, | ||
over_write=False, | ||
buffer_size=10000) | ||
dim = 1024 | ||
X = np.fromfile(str(bpe_oname), dtype=np.float32, count=-1) | ||
X.resize(X.shape[0] // dim, dim) | ||
embedding = X | ||
body = {'content': content, 'embedding': embedding.tolist()} | ||
return jsonify(body) | ||
content = request.args.get("q") | ||
lang = request.args.get( | ||
"lang", "eng" | ||
) # Default to English if 'lang' is not provided | ||
|
||
if content is None: | ||
return jsonify({"error": "Missing input content"}), 400 | ||
|
||
try: | ||
global laser2_encoder | ||
if lang in LASER2_LANGUAGE: # Checks for either the 3-letter code or the 8-letter code | ||
if not laser2_encoder: | ||
laser2_encoder = LaserEncoderPipeline(lang=lang) | ||
encoder = laser2_encoder | ||
else: | ||
lang_code = LASER3_LANGUAGE.get( | ||
lang, lang | ||
) # Use language code as key to prevent multiple entries for same language | ||
if lang_code not in encoder_cache: | ||
encoder_cache[lang_code] = LaserEncoderPipeline(lang=lang_code) | ||
encoder = encoder_cache[lang_code] | ||
|
||
embeddings = encoder.encode_sentences([content]) | ||
embeddings_list = embeddings.tolist() | ||
body = {"content": content, "embedding": embeddings_list} | ||
return jsonify(body), 200 | ||
|
||
except ValueError as e: | ||
# Check if the exception is due to an unsupported language | ||
if "unsupported language" in str(e).lower(): | ||
return jsonify({"error": f"Language '{lang}' is not supported."}), 400 | ||
else: | ||
return jsonify({"error": str(e)}), 400 | ||
|
||
|
||
if __name__ == "__main__": | ||
app.run(debug=True, port=80, host='0.0.0.0') | ||
app.run(debug=True, port=80, host="0.0.0.0") |
This file was deleted.
Oops, something went wrong.