diff --git a/docker/Dockerfile b/docker/Dockerfile index 449fb913..bf23e5f9 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -2,65 +2,37 @@ FROM continuumio/miniconda3 MAINTAINER Gilles Bodart -RUN conda create -n env python=3.6 +# Install build-essential (compiler and development tools) +RUN apt-get update && \ + apt-get install -y build-essential && \ + rm -rf /var/lib/apt/lists/* + +RUN conda create -n env python=3.8 RUN echo "source activate env" > ~/.bashrc ENV PATH /opt/conda/envs/env/bin:$PATH -RUN apt-get -qq -y update -RUN apt-get -qq -y upgrade -RUN apt-get -qq -y install \ - gcc \ - g++ \ - wget \ - curl \ - git \ - make \ - unzip \ - sudo \ - vim - -# Use C.UTF-8 locale to avoid issues with ASCII encoding -ENV LC_ALL=C.UTF-8 -ENV LANG=C.UTF-8 - # Set the working directory to /app WORKDIR /app -COPY ./requirements.txt /app/requirements.txt - -# Install any needed packages specified in requirements.txt -RUN pip install --trusted-host pypi.python.org -r requirements.txt --verbose - - -# Download LASER from FB -RUN git clone https://github.com/facebookresearch/LASER.git - -ENV LASER /app/LASER -WORKDIR $LASER - -RUN bash ./install_models.sh +# Copy the local laser-encoders repository +COPY laser_encoders /app/laser_encoders +COPY pyproject.toml /app/pyproject.toml +RUN pip install --upgrade pip +RUN pip install -e . 
+RUN pip install Flask==2.3.3 Requests==2.31.0 -#Installing FAISS - -RUN conda install --name env -c pytorch faiss-cpu -y - -RUN bash ./install_external_tools.sh - -COPY ./decode.py $LASER/tasks/embed/decode.py - - -# Make port 80 available to the world outside this container -WORKDIR /app - -RUN echo "Hello World" > test.txt +# Define the argument for language +ARG langs="eng_Latn" -RUN $LASER/tasks/embed/embed.sh test.txt en test_embed.raw -RUN python $LASER/tasks/embed/decode.py test_embed.raw +# Download language models for each specified language +RUN for lang in $langs; do \ + python -m laser_encoders.download_models --lang=$lang; \ + done -#Open the port 80 +# Open the port 80 EXPOSE 80 -COPY ./app.py /app/app.py +COPY docker/app.py /app/app.py CMD ["/bin/bash"] diff --git a/docker/README.md b/docker/README.md index 57e18c82..1c67e49f 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,19 +1,74 @@ -## Docker +## LASER Docker Image -An image docker has been created to help you with the settings of an environment here are the step to follow : +This image provides a convenient way to run LASER in a Docker container. -* Open a command prompt on the root of your LASER project -* Execute the command `docker build --tag=laser docker` -* Once the image is built run `docker run -it laser` +### Building the image +To build the image, run the following command from the root of the LASER directory: -A REST server on top of the embed task is under developement, -to run it you'll have to expose a local port [CHANGEME_LOCAL_PORT] by executing the next line instead of the last command. It'll overinde the command line entrypoint of your docker container. +``` +docker build --tag laser -f docker/Dockerfile . +``` +### Specifying Languages with `langs` Argument -* `docker run -p [CHANGEME_LOCAL_PORT]:80 -it laser python app.py` +You can pre-download the encoders and tokenizers for specific languages by using the `langs` build argument. 
This argument accepts a space-separated list of language codes. For example, to build an image with models for English and French, use the following command: +``` +docker build --build-arg langs="eng_Latn fra_Latn" -t laser -f docker/Dockerfile . +``` +If the `langs` argument is not specified during the build process, the image will default to building with English (`eng_Latn`). It's important to note that in this default case where English is selected, the LASER2 model, which supports 92 languages, is used. For a comprehensive list of LASER2 supported languages, refer to `LASER2_LANGUAGES_LIST` in [`language_list.py`](https://github.com/facebookresearch/LASER/blob/main/laser_encoders/language_list.py). + + +### Running the Image +Once the image is built, you can run it with the following command: + +``` +docker run -it laser +``` +**Note:** To expose a local port to the REST server built on top of the embed task, run the following command instead of the previous one: + +``` +docker run -it -p [CHANGEME_LOCAL_PORT]:80 laser python app.py +``` +This overrides the image's default command (`/bin/bash`), so the container starts the Flask server instead of a shell. + +Example: + +``` +docker run -it -p 8081:80 laser python app.py +``` This Flask server will serve a REST Api that can be use by calling your server with this URL : -* http://127.0.0.1:[CHANGEME_LOCAL_PORT]/vectorize?q=[YOUR_SENTENCE_URL_ENCODED]&lang=[LANGUAGE] +``` +http://127.0.0.1:[CHANGEME_LOCAL_PORT]/vectorize?q=[YOUR_SENTENCE_URL_ENCODED]&lang=[LANGUAGE] +``` + +Example: + +``` +http://127.0.0.1:8081/vectorize?q=ki%20lo%20'orukọ%20ẹ&lang=yor +``` + +Sample response: +``` +{ + "content": "ki lo 'orukọ ẹ", + "embedding": [ + [ + -0.10241681337356567, + 0.11120740324258804, + -0.26641348004341125, + -0.055699944496154785, + .... + .... + .... 
+ -0.034048307687044144, + 0.11005636304616928, + -0.3238321840763092, + -0.060631975531578064, + -0.19269055128097534 + ]] +} +``` Here is an example of how you can send requests to it with python: diff --git a/docker/app.py b/docker/app.py index a5574b9a..ac318c30 100644 --- a/docker/app.py +++ b/docker/app.py @@ -1,78 +1,64 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -from flask import Flask, request, jsonify import os import socket -import tempfile -from pathlib import Path -import numpy as np -from LASER.source.lib.text_processing import Token, BPEfastApply -from LASER.source.embed import * + +from flask import Flask, jsonify, request + +from laser_encoders import LaserEncoderPipeline +from laser_encoders.language_list import LASER2_LANGUAGE, LASER3_LANGUAGE app = Flask(__name__) -app.config['JSON_AS_ASCII'] = False + +# Global cache for encoders +encoder_cache = {} + +laser2_encoder = None @app.route("/") def root(): print("/") - html = "

Hello {name}!

" \ - "Hostname: {hostname}
" + html = "

Hello {name}!

" "Hostname: {hostname}
" return html.format(name=os.getenv("LASER", "world"), hostname=socket.gethostname()) -@app.route("/vectorize") +@app.route("/vectorize", methods=["GET"]) def vectorize(): - content = request.args.get('q') - lang = request.args.get('lang') - embedding = '' - if lang is None or not lang: - lang = "en" - # encoder - model_dir = Path(__file__).parent / "LASER" / "models" - encoder_path = model_dir / "bilstm.93langs.2018-12-26.pt" - bpe_codes_path = model_dir / "93langs.fcodes" - print(f' - Encoder: loading {encoder_path}') - encoder = SentenceEncoder(encoder_path, - max_sentences=None, - max_tokens=12000, - sort_kind='mergesort', - cpu=True) - with tempfile.TemporaryDirectory() as tmp: - tmpdir = Path(tmp) - ifname = tmpdir / "content.txt" - bpe_fname = tmpdir / 'bpe' - bpe_oname = tmpdir / 'out.raw' - with ifname.open("w") as f: - f.write(content) - if lang != '--': - tok_fname = tmpdir / "tok" - Token(str(ifname), - str(tok_fname), - lang=lang, - romanize=True if lang == 'el' else False, - lower_case=True, - gzip=False, - verbose=True, - over_write=False) - ifname = tok_fname - BPEfastApply(str(ifname), - str(bpe_fname), - str(bpe_codes_path), - verbose=True, over_write=False) - ifname = bpe_fname - EncodeFile(encoder, - str(ifname), - str(bpe_oname), - verbose=True, - over_write=False, - buffer_size=10000) - dim = 1024 - X = np.fromfile(str(bpe_oname), dtype=np.float32, count=-1) - X.resize(X.shape[0] // dim, dim) - embedding = X - body = {'content': content, 'embedding': embedding.tolist()} - return jsonify(body) + content = request.args.get("q") + lang = request.args.get( + "lang", "eng" + ) # Default to English if 'lang' is not provided + + if content is None: + return jsonify({"error": "Missing input content"}), 400 + + try: + global laser2_encoder + if lang in LASER2_LANGUAGE: # Checks for both 3-letter code or 8-letter code + if not laser2_encoder: + laser2_encoder = LaserEncoderPipeline(lang=lang) + encoder = laser2_encoder + else: + lang_code = 
LASER3_LANGUAGE.get( + lang, lang + ) # Use language code as key to prevent multiple entries for same language + if lang_code not in encoder_cache: + encoder_cache[lang_code] = LaserEncoderPipeline(lang=lang_code) + encoder = encoder_cache[lang_code] + + embeddings = encoder.encode_sentences([content]) + embeddings_list = embeddings.tolist() + body = {"content": content, "embedding": embeddings_list} + return jsonify(body), 200 + + except ValueError as e: + # Check if the exception is due to an unsupported language + if "unsupported language" in str(e).lower(): + return jsonify({"error": f"Language '{lang}' is not supported."}), 400 + else: + return jsonify({"error": str(e)}), 400 + if __name__ == "__main__": - app.run(debug=True, port=80, host='0.0.0.0') + app.run(debug=True, port=80, host="0.0.0.0") diff --git a/docker/requirements.txt b/docker/requirements.txt deleted file mode 100644 index 2b38074b..00000000 --- a/docker/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -Flask -scipy -numpy -Cython -torch -transliterate \ No newline at end of file