Commit
feat(#63): add a chatgpt like app for searching on associations (#112)
Closes #63
---------

Co-authored-by: anekameni <[email protected]>
billmetangmo and KameniAlexNea authored Dec 12, 2023
1 parent eb89dba commit 25b2b6c
Showing 11 changed files with 403 additions and 46 deletions.
9 changes: 9 additions & 0 deletions .gitignore
@@ -182,3 +182,12 @@ package.json
.aws/
## To avoid committing .envrc in gitpod, because this causes gitpod to execute .envrc instead of gitpod.yml
.envrc
etl/experiments/.chainlit/**
etl/.chainlit/**
etl/embeddings/**

format.*

# Ignore DevSpace/chainlit cache and log folder
.devspace/
.chainlit/
21 changes: 21 additions & 0 deletions Dockerfile
@@ -0,0 +1,21 @@
# Use an official Python runtime as a parent image
FROM python:3.11.7-bullseye

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE 1
ENV PYTHONUNBUFFERED 1

# Set the working directory in the container
WORKDIR /app

# Install any needed packages specified in requirements.txt
COPY etl/requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Copy the current directory contents into the container at /app
COPY . /app/

# Make port 8000 available to the world outside this container
EXPOSE 8000

CMD cd etl && chainlit run experiments/ui.py
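
For a quick local smoke test, the image can be built and run roughly as follows (a sketch, not part of this commit; the image tag and key value are placeholders, and the LangSmith variables wired up in devspace.yaml are omitted):

    # build from the repository root, where this Dockerfile lives
    docker build -t tchoung-te-chat .
    # run the Chainlit app and expose it on localhost:8000
    docker run --rm -p 8000:8000 -e OPENAI_API_KEY="sk-..." tchoung-te-chat
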
114 changes: 114 additions & 0 deletions devspace.yaml
@@ -0,0 +1,114 @@
version: v2beta1
name: tchoung-tegit

# This is a list of `pipelines` that DevSpace can execute (you can define your own)
pipelines:
# This is the pipeline for the main command: `devspace dev` (or `devspace run-pipeline dev`)
dev:
run: |-
run_dependencies --all # 1. Deploy any projects this project needs (see "dependencies")
ensure_pull_secrets --all # 2. Ensure pull secrets
create_deployments --all # 3. Deploy Helm charts and manifests specified as "deployments"
start_dev app # 4. Start dev mode "app" (see "dev" section)
# You can run this pipeline via `devspace deploy` (or `devspace run-pipeline deploy`)
deploy:
run: |-
run_dependencies --all # 1. Deploy any projects this project needs (see "dependencies")
ensure_pull_secrets --all # 2. Ensure pull secrets
build_images --all -t $(git describe --always) # 3. Build, tag (git commit hash) and push all images (see "images")
create_deployments --all # 4. Deploy Helm charts and manifests specified as "deployments"
# This is a list of `images` that DevSpace can build for this project
# We recommend skipping image building during development (devspace dev) as much as possible
images:
app:
image: mongulu/tchoung-te
dockerfile: ./Dockerfile

# This is a list of `deployments` that DevSpace can create for this project
deployments:
app:
# This deployment uses `helm` but you can also define `kubectl` deployments or kustomizations
helm:
# We are deploying this project with the Helm chart you provided
chart:
name: component-chart
repo: https://charts.devspace.sh
# Under `values` we can define the values for this Helm chart used during `helm install/upgrade`
# You may also use `valuesFiles` to load values from files, e.g. valuesFiles: ["values.yaml"]
values:
containers:
- image: mongulu/tchoung-te
env:
- name: OPENAI_API_KEY
valueFrom:
secretKeyRef:
key: root
name: openai-credentials
- name: LANGCHAIN_TRACING_V2
value: "true"
- name: LANGCHAIN_ENDPOINT
value: https://api.smith.langchain.com
- name: LANGCHAIN_PROJECT
value: tchoung-te
- name: LANGCHAIN_API_KEY
valueFrom:
secretKeyRef:
key: root
name: langchain-credentials
service:
ports:
- port: 8000
type: LoadBalancer
ingress:
tls: true
tlsClusterIssuer: letsencrypt-prod
ingressClass: traefik
rules:
- host: ai.mongulu.cm

# This is a list of `dev` containers that are based on the containers created by your deployments
dev:
app:
# Search for the container that runs this image
imageSelector: mongulu/tchoung-te
# Replace the container image with this dev-optimized image (allows skipping image building during development)
devImage: python:3.11.7-bullseye
# Sync files between the local filesystem and the development container
sync:
- path: ./:/app
# Open a terminal and use the following command to start it
terminal:
command: ./app/devspace_start.sh
# Inject a lightweight SSH server into the container (so your IDE can connect to the remote dev env)
ssh:
enabled: true
# Make the following commands from my local machine available inside the dev container
proxyCommands:
- command: devspace
- command: kubectl
- command: helm
- gitCredentials: true
# Forward the following ports to be able to access your application via localhost
ports:
- port: "8000"
# Open the following URLs once they return an HTTP status code other than 502 or 503
open:
- url: http://localhost:8000

# Use the `commands` section to define repeatable dev workflows for this project
commands:
migrate-db:
command: |-
echo 'This is a cross-platform, shared command that can be used to codify any kind of dev task.'
echo 'Anyone using this project can invoke it via "devspace run migrate-db"'
# Define dependencies to other projects with a devspace.yaml
# dependencies:
# api:
# git: https://... # Git-based dependencies
# tag: v1.0.0
# ui:
# path: ./ui # Path-based dependencies (for monorepos)

# To fill in the chart values, see https://www.devspace.sh/component-chart/docs/configuration
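
A typical workflow with this file would be (a sketch; the namespace name is illustrative, and the openai-credentials / langchain-credentials secrets referenced above must already exist in the cluster):

    # point DevSpace at the target namespace once
    devspace use namespace mongulu
    # dev mode: deploys the chart, syncs ./ into the container, forwards port 8000
    devspace dev
    # build, tag with the current git hash, push and deploy
    devspace deploy
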
36 changes: 36 additions & 0 deletions devspace_start.sh
@@ -0,0 +1,36 @@
#!/bin/bash
set +e # Continue on errors

COLOR_BLUE="\033[0;94m"
COLOR_GREEN="\033[0;92m"
COLOR_RESET="\033[0m"

# Print useful output for user
echo -e "${COLOR_BLUE}
%########%
%###########% ____ _____
%#########% | _ \ ___ __ __ / ___/ ____ ____ ____ ___
%#########% | | | | / _ \\\\\ \ / / \___ \ | _ \ / _ | / __// _ \\
%#############% | |_| |( __/ \ V / ____) )| |_) )( (_| |( (__( __/
%#############% |____/ \___| \_/ \____/ | __/ \__,_| \___\\\\\___|
%###############% |_|
%###########%${COLOR_RESET}
Welcome to your development container!
This is how you can work with it:
- Files will be synchronized between your local machine and this container
- Some ports will be forwarded, so you can access this container via localhost
- Run \`${COLOR_GREEN}cd etl && chainlit run experiments/ui.py${COLOR_RESET}\` to start the application
"

# Set terminal prompt
export PS1="\[${COLOR_BLUE}\]devspace\[${COLOR_RESET}\] ./\W \[${COLOR_BLUE}\]\\$\[${COLOR_RESET}\] "
if [ -z "$BASH" ]; then export PS1="$ "; fi

# Include project's bin/ folder in PATH
export PATH="./bin:$PATH"

# Open shell
bash --norc
15 changes: 15 additions & 0 deletions etl/chainlit.md
@@ -0,0 +1,15 @@
**Mongulu**: Your Interactive Cameroonian Association Directory 🇨🇲

![Mongulu Logo](https://avatars.githubusercontent.com/u/79483730?s=200&v=4)

Mongulu is your go-to directory for connecting with Cameroonian associations in France. It offers a comprehensive list of associations, personalized assistance, easy navigation, seamless communication, and updates from the Cameroonian community.

- **Comprehensive Directory**: Access an up-to-date list of Cameroonian associations.
- **Personalized Assistance**: Receive tailored recommendations.
- **Intuitive Navigation**: Easily connect with associations, events, and resources.
- **Seamless Communication**: Engage through a user-friendly chatbot.
- **Stay Informed**: Get updates on events and news.

Visit [Facebook](https://www.facebook.com/CollectifMongulu/) for feedback.

Contribute to [Mongulu](https://github.com/mongulu-cm/tchoung-te) and use it freely under the [MIT License](https://github.com/mongulu-cm/tchoung-te/blob/main/LICENSE).
34 changes: 21 additions & 13 deletions etl/enrich-database.py
@@ -12,15 +12,15 @@

tqdm.pandas()
pandarallel.initialize(progress_bar=True)
requests_cache.install_cache('enrich_cache', backend='sqlite')
requests_cache.install_cache("enrich_cache", backend="sqlite")
install(show_locals=True)


subscription_key = os.environ["BING_SUBSCRIPTION_KEY"]
search_url = "https://api.bing.microsoft.com/v7.0/search"

# %%
df = pd.read_csv('ref-rna-real-mars-2022.csv')
df = pd.read_csv("ref-rna-real-mars-2022.csv")

# %%
# Many titles contain the association name and its abbreviation, in parentheses or not
@@ -30,35 +30,42 @@


def enrich(site, name):

# time.sleep(1)

name = ftfy.fix_text(name) # remove the \ characters
if "(" in name:
# The schwartz_hearst algorithm splits the text into 2 parts: {"abbreviation": "text without the abbreviation"}
# However, it only works if the abbreviation is in parentheses and placed after the unabbreviated name.
# It therefore fails when the abbreviation comes before it, or when there are no parentheses.
pairs = schwartz_hearst.extract_abbreviation_definition_pairs(
doc_text=name)
pairs = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=name)
# print(pairs)
if len(pairs) == 1:
name = list(pairs.values())[0]

# inspired by https://github.com/Azure-Samples/cognitive-services-REST-api-samples/blob/master/python/Search/BingWebSearchv7.py
search_term = f'{name} site:{site}'
search_term = f"{name} site:{site}"
headers = {"Ocp-Apim-Subscription-Key": subscription_key}
params = {"q": search_term, "textDecorations": True,
"textFormat": "HTML", "mkt": "fr-FR"}
params = {
"q": search_term,
"textDecorations": True,
"textFormat": "HTML",
"mkt": "fr-FR",
}
response = requests.get(search_url, headers=headers, params=params)
response.raise_for_status()
search_results = response.json()

return search_results['webPages']['value'][0]['url'] if 'webPages' in search_results else "not found"
return (
search_results["webPages"]["value"][0]["url"]
if "webPages" in search_results
else "not found"
)


# %%
df['facebook_url'] = df.parallel_apply(lambda row: enrich("facebook.com",
row["titre"]), axis=1)
df["facebook_url"] = df.parallel_apply(
lambda row: enrich("facebook.com", row["titre"]), axis=1
)

# %%
df["facebook_url"].describe()
@@ -67,8 +74,9 @@ def enrich(site, name):
df["facebook_url"].head(100)

# %%
df['helloasso_url'] = df.parallel_apply(lambda row: enrich("helloasso.com",
row["titre"]), axis=1)
df["helloasso_url"] = df.parallel_apply(
lambda row: enrich("helloasso.com", row["titre"]), axis=1
)

# %%
df["helloasso_url"].describe()
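
The enrichment script itself is still run the same way after this formatting pass (a sketch; the key value is a placeholder and the command assumes the etl/ directory, where the CSV and the requests-cache sqlite file live):

    export BING_SUBSCRIPTION_KEY="..."   # read at import time by enrich-database.py
    cd etl
    python enrich-database.py            # adds facebook_url / helloasso_url columns via Bing search
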
14 changes: 5 additions & 9 deletions etl/experiments/4.process_data.py
@@ -2,7 +2,6 @@
# ## Necessary imports

# %%
import numpy as np
import pandas as pd

# %%
@@ -15,7 +14,7 @@ def remove_space_at_the_end(x: str):

def replace_double_quote(x: str):
if x is not None:
return x.replace("\"\"", "'")
return x.replace('""', "'")


def normalize(data: pd.DataFrame, text_columns):
@@ -36,18 +35,15 @@ def normalize(data: pd.DataFrame, text_columns):
# %% [markdown]
# ## Load and viz data
# %%
data = pd.read_csv(
"../ref-rna-real-mars-2022-enriched-not-qualified.csv", index_col=0)
data = pd.read_csv("../ref-rna-real-mars-2022-enriched-not-qualified.csv", index_col=0)
# ignore the first column: it is an index that was not saved correctly
data = data[data.columns[1:]]

# %%
data.info()

# %%
text_columns = [
"titre", "objet", "social_object1_libelle", "social_object2_libelle"
]
text_columns = ["titre", "objet", "social_object1_libelle", "social_object2_libelle"]

data = normalize(data, text_columns)
data.sample(5)
@@ -56,7 +52,7 @@ def normalize(data: pd.DataFrame, text_columns):
# ## Save without index

# %%
filename = '../ref-rna-real-mars-2022-enriched-not-qualified-process'
data.to_csv(f'./{filename}.csv', index=False)
filename = "../ref-rna-real-mars-2022-enriched-not-qualified-process"
data.to_csv(f"./{filename}.csv", index=False)

# %%
57 changes: 57 additions & 0 deletions etl/experiments/ui.py
@@ -0,0 +1,57 @@
import os

import chainlit as cl
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.conversation.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts.chat import (
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
)
from langchain.vectorstores import FAISS

system_template = """Vous êtes un assistant IA qui fournit des informations sur les associations camerounaises en France. Vous recevez une question et fournissez une réponse claire et structurée. Lorsque cela est pertinent, utilisez des points et des listes pour structurer vos réponses.
Utilisez les éléments de contexte suivants pour répondre à la question de l'utilisateur. Si vous ne connaissez pas la réponse, dites simplement que vous ne savez pas, n'essayez pas d'inventer une réponse.
Si vous souhaitez connaître le nombre d'associations, je vous recommande de visiter le site web "tchoung-te.mongulu.cm" pour obtenir des informations actualisées à ce sujet.
----------------
{context}"""
messages = [
SystemMessagePromptTemplate.from_template(system_template),
HumanMessagePromptTemplate.from_template("{question}"),
]
CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)

embedding_pth = "embeddings"
embeddings = OpenAIEmbeddings()
if os.path.exists(embedding_pth):
vectors = FAISS.load_local(embedding_pth, embeddings)
else:
loader = CSVLoader(
file_path="ref-rna-real-mars-2022-enriched-qualified.csv", encoding="utf-8"
)
data = loader.load()
vectors = FAISS.from_documents(data, embeddings)
vectors.save_local(embedding_pth)

llm = ChatOpenAI(max_tokens=500, temperature=0, model_name="gpt-3.5-turbo")
chain_type_kwargs = {"prompt": CHAT_PROMPT}


memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
chain = ConversationalRetrievalChain.from_llm(
llm=llm,
retriever=vectors.as_retriever(search_kwargs={"k": 3}),
combine_docs_chain_kwargs=chain_type_kwargs,
chain_type="stuff",
memory=memory,
)


@cl.langchain_factory(use_async=True)
def factory():
return chain
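
Outside the container, the same app can be started with the Chainlit CLI (a sketch; it assumes OPENAI_API_KEY is exported and the enriched CSV is present under etl/ so the FAISS index can be built on first run):

    export OPENAI_API_KEY="sk-..."        # placeholder value
    cd etl
    chainlit run experiments/ui.py -w     # -w reloads the app when files change
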