diff --git a/.gitignore b/.gitignore
index 422b9f1..a3d16e1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -182,3 +182,12 @@ package.json
 .aws/
 ## To avoid commit of .envrc in gitpod because this cause gitpod to execute .envrc instead of gitpod.yml
 .envrc
+etl/experiments/.chainlit/**
+etl/.chainlit/**
+etl/embeddings/**
+
+format.*
+
+# Ignore DevSpace/chainlit cache and log folder
+.devspace/
+.chainlit/
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..deb6586
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,21 @@
+# Use an official Python runtime as a parent image
+FROM python:3.11.7-bullseye
+
+# Set environment variables
+ENV PYTHONDONTWRITEBYTECODE 1
+ENV PYTHONUNBUFFERED 1
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Install any needed packages specified in requirements.txt
+COPY etl/requirements.txt /app/
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the current directory contents into the container at /app
+COPY . /app/
+
+# Make port 8000 available to the world outside this container
+EXPOSE 8000
+
+CMD cd etl && chainlit run experiments/ui.py
diff --git a/devspace.yaml b/devspace.yaml
new file mode 100755
index 0000000..0ed521f
--- /dev/null
+++ b/devspace.yaml
@@ -0,0 +1,114 @@
+version: v2beta1
+name: tchoung-tegit
+
+# This is a list of `pipelines` that DevSpace can execute (you can define your own)
+pipelines:
+  # This is the pipeline for the main command: `devspace dev` (or `devspace run-pipeline dev`)
+  dev:
+    run: |-
+      run_dependencies --all       # 1. Deploy any projects this project needs (see "dependencies")
+      ensure_pull_secrets --all    # 2. Ensure pull secrets
+      create_deployments --all     # 3. Deploy Helm charts and manifests specified as "deployments"
+      start_dev app                # 4. Start dev mode "app" (see "dev" section)
+  # You can run this pipeline via `devspace deploy` (or `devspace run-pipeline deploy`)
+  deploy:
+    run: |-
+      run_dependencies --all                            # 1. Deploy any projects this project needs (see "dependencies")
+      ensure_pull_secrets --all                         # 2. Ensure pull secrets
+      build_images --all -t $(git describe --always)    # 3. Build, tag (git commit hash) and push all images (see "images")
+      create_deployments --all                          # 4. Deploy Helm charts and manifests specified as "deployments"
+
+# This is a list of `images` that DevSpace can build for this project
+# We recommend to skip image building during development (devspace dev) as much as possible
+images:
+  app:
+    image: mongulu/tchoung-te
+    dockerfile: ./Dockerfile
+
+# This is a list of `deployments` that DevSpace can create for this project
+deployments:
+  app:
+    # This deployment uses `helm` but you can also define `kubectl` deployments or kustomizations
+    helm:
+      # We are deploying this project with the Helm chart you provided
+      chart:
+        name: component-chart
+        repo: https://charts.devspace.sh
+      # Under `values` we can define the values for this Helm chart used during `helm install/upgrade`
+      # You may also use `valuesFiles` to load values from files, e.g.
+      # valuesFiles: ["values.yaml"]
+      values:
+        containers:
+          - image: mongulu/tchoung-te
+            env:
+              - name: OPENAI_API_KEY
+                valueFrom:
+                  secretKeyRef:
+                    key: root
+                    name: openai-credentials
+              - name: LANGCHAIN_TRACING_V2
+                value: "true"
+              - name: LANGCHAIN_ENDPOINT
+                value: https://api.smith.langchain.com
+              - name: LANGCHAIN_PROJECT
+                value: tchoung-te
+              - name: LANGCHAIN_API_KEY
+                valueFrom:
+                  secretKeyRef:
+                    key: root
+                    name: langchain-credentials
+        service:
+          ports:
+            - port: 8000
+          type: LoadBalancer
+        ingress:
+          tls: true
+          tlsClusterIssuer: letsencrypt-prod
+          ingressClass: traefik
+          rules:
+            - host: ai.mongulu.cm
+
+# This is a list of `dev` containers that are based on the containers created by your deployments
+dev:
+  app:
+    # Search for the container that runs this image
+    imageSelector: mongulu/tchoung-te
+    # Replace the container image with this dev-optimized image (allows to skip image building during development)
+    devImage: python:3.11.7-bullseye
+    # Sync files between the local filesystem and the development container
+    sync:
+      - path: ./:/app
+    # Open a terminal and use the following command to start it
+    terminal:
+      command: ./app/devspace_start.sh
+    # Inject a lightweight SSH server into the container (so your IDE can connect to the remote dev env)
+    ssh:
+      enabled: true
+    # Make the following commands from my local machine available inside the dev container
+    proxyCommands:
+      - command: devspace
+      - command: kubectl
+      - command: helm
+      - gitCredentials: true
+    # Forward the following ports to be able to access your application via localhost
+    ports:
+      - port: "8000"
+    # Open the following URLs once they return an HTTP status code other than 502 or 503
+    open:
+      - url: http://localhost:8000
+
+# Use the `commands` section to define repeatable dev workflows for this project
+commands:
+  migrate-db:
+    command: |-
+      echo 'This is a cross-platform, shared command that can be used to codify any kind of dev task.'
+      echo 'Anyone using this project can invoke it via "devspace run migrate-db"'
+
+# Define dependencies to other projects with a devspace.yaml
+# dependencies:
+#   api:
+#     git: https://...  # Git-based dependencies
+#     tag: v1.0.0
+#   ui:
+#     path: ./ui        # Path-based dependencies (for monorepos)
+
+# To fill , use https://www.devspace.sh/component-chart/docs/configuration
\ No newline at end of file
diff --git a/devspace_start.sh b/devspace_start.sh
new file mode 100755
index 0000000..d681506
--- /dev/null
+++ b/devspace_start.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+set +e  # Continue on errors
+
+COLOR_BLUE="\033[0;94m"
+COLOR_GREEN="\033[0;92m"
+COLOR_RESET="\033[0m"
+
+# Print useful output for user
+echo -e "${COLOR_BLUE}
+     %########%
+     %###########%       ____                 _____
+         %#########%    |  _ \   ___ __   __ / ___/ ____   ____   ____ ___
+         %#########%    | | | | / _ \\\\\ \ / / \___ \ |  _ \ / _  | / __// _ \\
+     %#############%    | |_| |(  __/ \ V /  ____) )| |_) )( (_| |( (__(  __/
+     %#############%    |____/  \___|  \_/   \____/ | __/  \__,_| \___\\\\\___|
+ %###############%                                  |_|
+ %###########%${COLOR_RESET}
+
+
+Welcome to your development container!
+
+This is how you can work with it:
+- Files will be synchronized between your local machine and this container
+- Some ports will be forwarded, so you can access this container via localhost
+- Run \`${COLOR_GREEN}python main.py${COLOR_RESET}\` to start the application
+"
+
+# Set terminal prompt
+export PS1="\[${COLOR_BLUE}\]devspace\[${COLOR_RESET}\] ./\W \[${COLOR_BLUE}\]\\$\[${COLOR_RESET}\] "
+if [ -z "$BASH" ]; then export PS1="$ "; fi
+
+# Include project's bin/ folder in PATH
+export PATH="./bin:$PATH"
+
+# Open shell
+bash --norc
diff --git a/etl/chainlit.md b/etl/chainlit.md
new file mode 100644
index 0000000..878c1ab
--- /dev/null
+++ b/etl/chainlit.md
@@ -0,0 +1,15 @@
+**Mongulu**: Your Interactive Cameroonian Association Directory 🇨🇲
+
+![Mongulu Logo](https://avatars.githubusercontent.com/u/79483730?s=200&v=4)
+
+Mongulu is your go-to directory for connecting with Cameroonian associations in France. It offers a comprehensive list of associations, personalized assistance, easy navigation, seamless communication, and updates from the Cameroonian community.
+
+- **Comprehensive Directory**: Access an up-to-date list of Cameroonian associations.
+- **Personalized Assistance**: Receive tailored recommendations.
+- **Intuitive Navigation**: Easily connect with associations, events, and resources.
+- **Seamless Communication**: Engage through a user-friendly chatbot.
+- **Stay Informed**: Get updates on events and news.
+
+Visit [Facebook](https://www.facebook.com/CollectifMongulu/) for feedback.
+
+Contribute to [Mongulu](https://github.com/mongulu-cm/tchoung-te) and use it freely under the [MIT License](https://github.com/mongulu-cm/tchoung-te/blob/main/LICENSE).
\ No newline at end of file
diff --git a/etl/enrich-database.py b/etl/enrich-database.py
index 8cddf52..664807d 100644
--- a/etl/enrich-database.py
+++ b/etl/enrich-database.py
@@ -12,7 +12,7 @@
 tqdm.pandas()
 pandarallel.initialize(progress_bar=True)
 
-requests_cache.install_cache('enrich_cache', backend='sqlite')
+requests_cache.install_cache("enrich_cache", backend="sqlite")
 
 install(show_locals=True)
 
@@ -20,7 +20,7 @@
 search_url = "https://api.bing.microsoft.com/v7.0/search"
 
 # %%
-df = pd.read_csv('ref-rna-real-mars-2022.csv')
+df = pd.read_csv("ref-rna-real-mars-2022.csv")
 
 # %%
 # Plusieurs titres contiennent le nom de l'association et abbreviation entre parenthèses ou pas
@@ -30,7 +30,6 @@
 
 
 def enrich(site, name):
-
     # time.sleep(1)
     name = ftfy.fix_text(name)  # enlever les \
 
@@ -38,27 +37,35 @@ def enrich(site, name):
     # L'algorithme de schwartz_hearst sépare le texte en 2 parties {"abbreviation" : "texte sans abbreviation"}
     # Cependant il ne fonctionne que si abbreviation est entre parenthèses et après le nom non abrégé.
     # Il ne fonctionne donc pas si abbreviation est avant celui-ci et dans le cas ou il n'y a pas de parenthèses.
-    pairs = schwartz_hearst.extract_abbreviation_definition_pairs(
-        doc_text=name)
+    pairs = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=name)
     # print(pairs)
     if len(pairs) == 1:
         name = list(pairs.values())[0]
 
     # inspired from https://github.com/Azure-Samples/cognitive-services-REST-api-samples/blob/master/python/Search/BingWebSearchv7.py
-    search_term = f'{name} site:{site}'
+    search_term = f"{name} site:{site}"
     headers = {"Ocp-Apim-Subscription-Key": subscription_key}
-    params = {"q": search_term, "textDecorations": True,
-              "textFormat": "HTML", "mkt": "fr-FR"}
+    params = {
+        "q": search_term,
+        "textDecorations": True,
+        "textFormat": "HTML",
+        "mkt": "fr-FR",
+    }
     response = requests.get(search_url, headers=headers, params=params)
     response.raise_for_status()
     search_results = response.json()
 
-    return search_results['webPages']['value'][0]['url'] if 'webPages' in search_results else "not found"
+    return (
+        search_results["webPages"]["value"][0]["url"]
+        if "webPages" in search_results
+        else "not found"
+    )
 
 
 # %%
-df['facebook_url'] = df.parallel_apply(lambda row: enrich("facebook.com",
-                                                          row["titre"]), axis=1)
+df["facebook_url"] = df.parallel_apply(
+    lambda row: enrich("facebook.com", row["titre"]), axis=1
+)
 
 # %%
 df["facebook_url"].describe()
@@ -67,8 +74,9 @@ def enrich(site, name):
 df["facebook_url"].head(100)
 
 # %%
-df['helloasso_url'] = df.parallel_apply(lambda row: enrich("helloasso.com",
-                                                           row["titre"]), axis=1)
+df["helloasso_url"] = df.parallel_apply(
+    lambda row: enrich("helloasso.com", row["titre"]), axis=1
+)
 
 # %%
 df["helloasso_url"].describe()
diff --git a/etl/experiments/4.process_data.py b/etl/experiments/4.process_data.py
index f7575ee..c7d07b0 100644
--- a/etl/experiments/4.process_data.py
+++ b/etl/experiments/4.process_data.py
@@ -2,7 +2,6 @@
 # ## Necessary imports
 
 # %%
-import numpy as np
 import pandas as pd
 
 # %%
@@ -15,7 +14,7 @@ def remove_space_at_the_end(x: str):
 
 def replace_double_quote(x: str):
     if x is not None:
-        return x.replace("\"\"", "'")
+        return x.replace('""', "'")
 
 
 def normalize(data: pd.DataFrame, text_columns):
@@ -36,8 +35,7 @@ def normalize(data: pd.DataFrame, text_columns):
 # %% [markdown]
 # ## Load and viz data
 # %%
-data = pd.read_csv(
-    "../ref-rna-real-mars-2022-enriched-not-qualified.csv", index_col=0)
+data = pd.read_csv("../ref-rna-real-mars-2022-enriched-not-qualified.csv", index_col=0)
 
 # ignore first column it is index not correctly saved
 data = data[data.columns[1:]]
@@ -45,9 +43,7 @@ def normalize(data: pd.DataFrame, text_columns):
 data.info()
 
 # %%
-text_columns = [
-    "titre", "objet", "social_object1_libelle", "social_object2_libelle"
-]
+text_columns = ["titre", "objet", "social_object1_libelle", "social_object2_libelle"]
 
 data = normalize(data, text_columns)
 data.sample(5)
@@ -56,7 +52,7 @@ def normalize(data: pd.DataFrame, text_columns):
 # ## Save without index
 
 # %%
-filename = '../ref-rna-real-mars-2022-enriched-not-qualified-process'
-data.to_csv(f'./{filename}.csv', index=False)
+filename = "../ref-rna-real-mars-2022-enriched-not-qualified-process"
+data.to_csv(f"./{filename}.csv", index=False)
 
 # %%
diff --git a/etl/experiments/ui.py b/etl/experiments/ui.py
new file mode 100644
index 0000000..9bf84bf
--- /dev/null
+++ b/etl/experiments/ui.py
@@ -0,0 +1,57 @@
+import os
+
+import chainlit as cl
+from langchain.chains import ConversationalRetrievalChain
+from langchain.chains.conversation.memory import ConversationBufferMemory
+from langchain.chat_models import ChatOpenAI
+from langchain.document_loaders.csv_loader import CSVLoader
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.prompts.chat import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from langchain.vectorstores import FAISS
+
+system_template = """Vous êtes un assistant IA qui fournit des informations sur les associations camerounaises en France. Vous recevez une question et fournissez une réponse claire et structurée. Lorsque cela est pertinent, utilisez des points et des listes pour structurer vos réponses.
+
+Utilisez les éléments de contexte suivants pour répondre à la question de l'utilisateur. Si vous ne connaissez pas la réponse, dites simplement que vous ne savez pas, n'essayez pas d'inventer une réponse.
+
+Si vous souhaitez connaître le nombre d'associations, je vous recommande de visiter le site web "tchoung-te.mongulu.cm" pour obtenir des informations actualisées à ce sujet.
+----------------
+{context}"""
+messages = [
+    SystemMessagePromptTemplate.from_template(system_template),
+    HumanMessagePromptTemplate.from_template("{question}"),
+]
+CHAT_PROMPT = ChatPromptTemplate.from_messages(messages)
+
+embedding_pth = "embeddings"
+embeddings = OpenAIEmbeddings()
+if os.path.exists(embedding_pth):
+    vectors = FAISS.load_local(embedding_pth, embeddings)
+else:
+    loader = CSVLoader(
+        file_path="ref-rna-real-mars-2022-enriched-qualified.csv", encoding="utf-8"
+    )
+    data = loader.load()
+    vectors = FAISS.from_documents(data, embeddings)
+    vectors.save_local(embedding_pth)
+
+llm = ChatOpenAI(max_tokens=500, temperature=0, model_name="gpt-3.5-turbo")
+chain_type_kwargs = {"prompt": CHAT_PROMPT}
+
+
+memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+chain = ConversationalRetrievalChain.from_llm(
+    llm=llm,
+    retriever=vectors.as_retriever(search_kwargs={"k": 3}),
+    combine_docs_chain_kwargs=chain_type_kwargs,
+    chain_type="stuff",
+    memory=memory,
+)
+
+
+@cl.langchain_factory(use_async=True)
+def factory():
+    return chain
diff --git a/etl/experiments/update-database/main.py b/etl/experiments/update-database/main.py
index 3ffa706..e964485 100644
--- a/etl/experiments/update-database/main.py
+++ b/etl/experiments/update-database/main.py
@@ -2,32 +2,60 @@
 # create a pandas dataframe from a csv file
 import pandas as pd
 
-df = pd.read_csv('diff.csv', usecols=['id'])
-df2 = pd.read_csv('../../ref-rna-real-mars-2022-enriched-not-qualified.csv', )
+df = pd.read_csv("diff.csv", usecols=["id"])
+df2 = pd.read_csv(
+    "../../ref-rna-real-mars-2022-enriched-not-qualified.csv",
+)
 
 # %%
 # join df and df2 on the id column
-df3 = df.join(df2.set_index('id'), on='id')
+df3 = df.join(df2.set_index("id"), on="id")
 
 # drop columns that are not needed
-df4 = df3.drop(columns=['Unnamed: 0.1', 'Unnamed: 0','adrs_numvoie','adrs_typevoie','adrs_libvoie',
-                        'adrs_codepostal','adrs_libcommune','siteweb','latitude','longitude'])
+df4 = df3.drop(
+    columns=[
+        "Unnamed: 0.1",
+        "Unnamed: 0",
+        "adrs_numvoie",
+        "adrs_typevoie",
+        "adrs_libvoie",
+        "adrs_codepostal",
+        "adrs_libcommune",
+        "siteweb",
+        "latitude",
+        "longitude",
+    ]
+)
 df4.to_csv("ref-rna-real-mars-2022-enriched-not-qualified-new-only.csv")
 
 # %%
-df5 = pd.read_csv('ref-rna-real-mars-2022-enriched-qualified-new-only.csv')
-df5 = df5.join(df3.drop(columns=['objet','Unnamed: 0.1', 'Unnamed: 0','adrs', 'dept', 'region',
-                                 'social_object1_libelle', 'titre','facebook_url', 'helloasso_url'])
-               .set_index('id'), on='id')
+df5 = pd.read_csv("ref-rna-real-mars-2022-enriched-qualified-new-only.csv")
+df5 = df5.join(
+    df3.drop(
+        columns=[
+            "objet",
+            "Unnamed: 0.1",
+            "Unnamed: 0",
+            "adrs",
+            "dept",
+            "region",
+            "social_object1_libelle",
+            "titre",
+            "facebook_url",
+            "helloasso_url",
+        ]
+    ).set_index("id"),
+    on="id",
+)
 
-df6 = pd.read_csv('../../ref-rna-real-mars-2022-enriched-qualified.csv')
-if df5['id'].isin(df6['id']).any():
+df6 = pd.read_csv("../../ref-rna-real-mars-2022-enriched-qualified.csv")
+if df5["id"].isin(df6["id"]).any():
     print("Dataframes have at least one common value on column 'id'")
 else:
     df7 = pd.concat([df5, df6], ignore_index=True)
-    df7.set_index('id', inplace=True)
+    df7.set_index("id", inplace=True)
 
     """ replace "not found" values with '' so that when not present they are not displayed on Gogocarto """
-    df7['helloasso_url'].replace('not found', '', inplace=True)
-    df7['facebook_url'].replace('not found', '', inplace=True)
+    df7["helloasso_url"].replace("not found", "", inplace=True)
+    df7["facebook_url"].replace("not found", "", inplace=True)
 
     df7.to_csv("../../ref-rna-real-mars-2022-enriched-qualified.csv")
diff --git a/etl/filter-cameroon.py b/etl/filter-cameroon.py
index 783f93e..68e1106 100644
--- a/etl/filter-cameroon.py
+++ b/etl/filter-cameroon.py
@@ -1,27 +1,40 @@
 # %%
 # CSV Files downloaded from https://www.data.gouv.fr/fr/datasets/repertoire-national-des-associations/ Fichier RNA Waldec du 01 Mars 2022
-import numpy as np
-from rich.console import Console
+import datetime as dt
 import glob
 import os
 import time
-import openai
+
 import boto3
-from diskcache import Cache
+import numpy as np
+import openai
 import pandas as pd
 import requests_cache
+from diskcache import Cache
 from geopy.geocoders import Nominatim
-from pandarallel import pandarallel
 from lambdaprompt import GPT3Prompt
-import time
-import datetime as dt
-
+from pandarallel import pandarallel
+from rich.console import Console
 
 # %%
 start = time.time()
 file_location = os.getcwd() + "/rna_waldec_20220301/"
 all_files = glob.glob(os.path.join(file_location, "*.csv"))
 
+columns = [
+    "id",
+    "titre",
+    "objet",
+    "objet_social1",
+    "objet_social2",
+    "adrs_numvoie",
+    "position",
+    "adrs_typevoie",
+    "adrs_libvoie",
+    "adrs_codepostal",
+    "adrs_libcommune",
+    "siteweb",
+]
 columns = [
     "id",
     "titre",
@@ -37,6 +50,20 @@
     "siteweb",
 ]
 
+df = pd.concat(
+    (
+        pd.read_csv(
+            f,
+            delimiter=";",
+            header=0,
+            encoding="ISO-8859-1",
+            usecols=columns,
+            engine="c",
+        )
+        for f in all_files
+    ),
+    ignore_index=True,
+)
 df_associations = pd.concat(
     [
         pd.read_csv(
@@ -57,10 +84,13 @@
 
 # %%
 ssm = boto3.client("ssm", region_name="eu-central-1")
+ssm = boto3.client("ssm", region_name="eu-central-1")
 openai.api_key = ssm.get_parameter(
     Name="/tchoung-te/openai_api_key", WithDecryption=False
 )["Parameter"]["Value"]
+    Name="/tchoung-te/openai_api_key", WithDecryption=False
+)["Parameter"]["Value"]
 
 # setter la variable d'environnement
@@ -72,6 +102,12 @@
 
 
 def filter_cameroon(df):
+    return df[
+        df["titre"].str.contains("CAMEROUN", case=False, na=False)
+        | df["objet"].str.contains("CAMEROUN", case=False, na=False)
+        | df["titre"].str.contains("KMER", case=False, na=False)
+        | df["objet"].str.contains("KMER", case=False, na=False)
+    ]
     """
     Filter associations with "Cameroun" in the title or the object
     """
@@ -89,6 +125,11 @@ def remove_closed(df):
 
 
 def normalize(df):
+    df["titre"] = df["titre"].str.upper()
+    df["objet"] = df["objet"].str.lower()
+    df["adrs_codepostal"] = df["adrs_codepostal"].astype(int)
+    df["objet_social1"] = df["objet_social1"].astype(int)
+    df["objet_social2"] = df["objet_social2"].astype(int)
     """
     Normalize strings in the associations infos
     """
@@ -103,6 +144,25 @@ def normalize(df):
     return df
 
 
+def select_relevant_columns(df):
+    return df[
+        [
+            "id",
+            "titre",
+            "objet",
+            "objet_social1",
+            "objet_social2",
+            "adrs_numvoie",
+            "adrs_typevoie",
+            "adrs_libvoie",
+            "adrs_codepostal",
+            "adrs_libcommune",
+            "siteweb",
+        ]
+    ]
+
+
+df2 = df.pipe(filter_cameroon).pipe(remove_closed).pipe(normalize)
 df_cameroon_associations = (
     df_associations.pipe(filter_cameroon).pipe(remove_closed).pipe(normalize)
 )
@@ -190,9 +250,22 @@ def normalize(df):
 # Build a list of all adresses in the cache & remove useless spaces
 all_adresses = "\n".join(list(cache))
 all_adresses = all_adresses.split("\n")
+all_adresses = "\n".join(list(cache))
+all_adresses = all_adresses.split("\n")
 all_adresses = [x.strip() for x in all_adresses]
 
 # Build adresse by concatenation
+df2["adrs"] = (
+    df2["adrs_numvoie"].map(str)
+    + " "
+    + df2["adrs_typevoie"].map(str)
+    + " "
+    + df2["adrs_libvoie"].map(str)
+    + " "
+    + df2["adrs_codepostal"].map(str)
+    + " "
+    + df2["adrs_libcommune"].map(str)
+)
 df_cameroon_associations["adrs"] = (
     df_cameroon_associations["adrs_numvoie"].map(str)
     + " "
@@ -274,6 +347,7 @@ def normalize(df):
 waldec_csv[17035] = "SANTÉ"
 waldec_csv[40580] = "ACTIVTÉS RELIGIEUSES, SPIRITUELLES OU PHILOSOPHIQUES"
+
 
 
 # %%
 def get_dept_region(code_postal):
diff --git a/etl/requirements.txt b/etl/requirements.txt
index 79cd7b0..95d8a45 100644
--- a/etl/requirements.txt
+++ b/etl/requirements.txt
@@ -1,7 +1,6 @@
 colorama==0.4.4
 findspark==2.0.1
 geocoder==1.38.1
-geopandas==0.10.2
 geopy==2.2.0
 jupyter==1.0.0
 jupyter-utils==1.2.6
@@ -18,7 +17,7 @@ cryptography>=3.2 # not directly required, pinned by Snyk to avoid a vulnerabili
 ipython>=8.10.0 # not directly required, pinned by Snyk to avoid a vulnerability
 setuptools>=65.5.1 # not directly required, pinned by Snyk to avoid a vulnerability
 lambdaprompt==0.4.2
-openai==0.27.0
+openai==0.27.8
 boto3==1.26.82
 diskcache==5.4.0
 numpy==1.24.3