diff --git a/.github/workflows/docker-package.yml b/.github/workflows/docker-package.yml
new file mode 100644
index 0000000..e0865b5
--- /dev/null
+++ b/.github/workflows/docker-package.yml
@@ -0,0 +1,31 @@
+name: Docker package
+
+on:
+  push:
+    branches: [ "main" ]
+
+jobs:
+
+  build_docker_image:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+      - name: Get Version Tags
+        id: versions
+        run: |
+          echo "BACKEND_VERSION=$(echo "$(> "$GITHUB_OUTPUT"
+      - name: Docker Login
+        uses: docker/login-action@v1
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - name: Build & push backend
+        uses: docker/build-push-action@v2
+        with:
+          file: Dockerfile
+          push: true
+          tags: |
+            ghcr.io/scai-bio/backend:latest
+            ghcr.io/scai-bio/backend:${{ steps.versions.outputs.BACKEND_VERSION }}
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index b9324ca..7794281
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -35,27 +35,3 @@ jobs:
     - name: Test with pytest
       run: |
         pytest
-
-  build_docker_image:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-      - name: Get Version Tags
-        id: versions
-        run: |
-          echo "BACKEND_VERSION=$(echo "$(> "$GITHUB_OUTPUT"
-      - name: Docker Login
-        uses: docker/login-action@v1
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-      - name: Build & push backend
-        uses: docker/build-push-action@v2
-        with:
-          file: Dockerfile
-          push: true
-          tags: |
-            ghcr.io/scai-bio/backend:latest
-            ghcr.io/scai-bio/backend:${{ steps.versions.outputs.BACKEND_VERSION }}
diff --git a/README.md b/README.md
index e9a9f04..c8c4b94
--- a/README.md
+++ b/README.md
@@ -1,2 +1,58 @@
 # INDEX – the Intelligent Data Steward Toolbox
-Intelligent data steward toolbox using Large Language Model embeddings for automated Data-Harmonization
+
+INDEX is an intelligent data steward toolbox that leverages Large Language Model embeddings for automated Data-Harmonization.
+
+## Table of Contents
+- [Introduction](#introduction)
+- [Installation & Usage](#installation--usage)
+- [Configuration](#configuration)
+
+## Introduction
+
+INDEX relies on vector embeddings calculated based on variable descriptions to generate mapping suggestions for any
+dataset, enabling efficient and accurate data indexing and retrieval. Confirmed mappings are stored alongside their
+vectorized representations in a knowledge base, facilitating rapid search and retrieval operations, ultimately enhancing
+data management and analysis capabilities. New mappings may be added to the knowledge base in an iterative procedure,
+allowing for improved mapping suggestions in subsequent harmonization tasks.
+
+## Installation & Usage
+
+Clone the repository:
+
+```bash
+git clone https://github.com/SCAI-BIO/index
+cd index
+```
+
+### Starting the Backend locally
+
+Install Python requirements:
+
+```bash
+pip install -r requirements.txt
+```
+
+
+Run the Backend API on port 5000:
+
+```bash
+uvicorn main:app --reload --port 5000
+```
+
+### Run the Backend via Docker
+
+Download the latest Docker build:
+
+```bash
+docker pull ghcr.io/scai-bio/backend:latest
+```
+
+## Configuration
+
+### Description Embeddings
+
+You can configure INDEX to use either a local language model or call OpenAI's embedding API. While using the OpenAI API
+is significantly faster, you will need to provide an API key that is linked to your OpenAI account.
+
+Currently, the following local models are implemented:
+* [MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)
\ No newline at end of file
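
As a rough, non-authoritative sketch of the two embedding back-ends mentioned in the README above: the model identifiers (`sentence-transformers/all-mpnet-base-v2`, `text-embedding-ada-002`) and client calls below are assumptions for illustration, not INDEX's actual configuration interface.

```python
# Hedged sketch of the two embedding back-ends described in the README.
# NOT INDEX's internal API; model names and client usage are assumptions.
import os
from typing import List

from sentence_transformers import SentenceTransformer  # local MPNet-style encoder
from openai import OpenAI  # OpenAI embeddings API (requires an API key)


def embed_locally(descriptions: List[str]) -> List[List[float]]:
    """Compute embeddings on the local machine with an MPNet sentence encoder."""
    model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
    return model.encode(descriptions).tolist()


def embed_via_openai(descriptions: List[str]) -> List[List[float]]:
    """Compute embeddings through the OpenAI API (faster, but needs OPENAI_API_KEY)."""
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    response = client.embeddings.create(model="text-embedding-ada-002", input=descriptions)
    return [item.embedding for item in response.data]


if __name__ == "__main__":
    variable_descriptions = ["Age of the participant at the baseline visit"]
    print(len(embed_locally(variable_descriptions)[0]))      # e.g. 768 dimensions
    print(len(embed_via_openai(variable_descriptions)[0]))   # e.g. 1536 dimensions
```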
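Likewise, a minimal conceptual sketch of the embedding-based mapping suggestion described in the README's Introduction, assuming a tiny in-memory knowledge base of confirmed mappings and cosine-similarity ranking; this is an illustration of the idea, not INDEX's implementation.

```python
# Conceptual sketch: suggest mappings for a new variable description by cosine
# similarity against embeddings of confirmed mappings (illustrative data only).
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Illustrative knowledge base: harmonized variable -> confirmed description
knowledge_base = {
    "age_baseline": "Age of the participant at the baseline visit",
    "sex": "Biological sex of the participant",
    "bmi": "Body mass index measured at the baseline visit",
}
kb_names = list(knowledge_base)
kb_vectors = model.encode(list(knowledge_base.values()), normalize_embeddings=True)


def suggest_mapping(description: str, top_k: int = 2) -> list:
    """Return the top_k knowledge-base entries most similar to a new description."""
    query = model.encode([description], normalize_embeddings=True)[0]
    scores = kb_vectors @ query  # cosine similarity, since vectors are normalized
    ranked = np.argsort(scores)[::-1][:top_k]
    return [(kb_names[i], float(scores[i])) for i in ranked]


if __name__ == "__main__":
    print(suggest_mapping("Participant age in years at study entry"))
```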