diff --git a/.gitignore b/.gitignore index c8b92581..a5b84e97 100644 --- a/.gitignore +++ b/.gitignore @@ -51,6 +51,9 @@ venv.bak/ # Jupytext: we version converted md files not ipynb sources *.ipynb +# Jupyter +.ipynb_checkpoints/ + # -- Provisioning provisioning/.terraform* provisioning/terraform.tfstate* diff --git a/docker-compose.yml b/docker-compose.yml index fc1b53d0..ee7f7895 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -80,6 +80,8 @@ services: NB_GID: ${DOCKER_GID:-1000} CHOWN_HOME: 'yes' CHOWN_HOME_OPTS: -R + env_file: + - env.d/notebook ports: - 8888:8888 volumes: diff --git a/env.d/notebook b/env.d/notebook new file mode 100644 index 00000000..f67cbce7 --- /dev/null +++ b/env.d/notebook @@ -0,0 +1 @@ +DATABASE_URL=postgresql+psycopg://qualicharge:pass@postgresql:5432/qualicharge-api diff --git a/src/notebook/Dockerfile b/src/notebook/Dockerfile index e6f1091d..b00926c6 100644 --- a/src/notebook/Dockerfile +++ b/src/notebook/Dockerfile @@ -1,11 +1,15 @@ # -- Custom image -- -FROM jupyter/base-notebook +FROM quay.io/jupyter/base-notebook:notebook-7.2.1 # Install base dependencies +# +# FIXME: jupytext 1.16.4+ seems to fix the issue but is not released yet +# see: https://github.com/mwouts/jupytext/issues/1260 RUN mamba install --yes \ duckdb \ geopandas \ - jupytext \ + jupytext==1.16.2 \ matplotlib \ pandas \ + psycopg[binary,pool] \ seaborn diff --git a/src/notebook/example.md b/src/notebook/example.md new file mode 100644 index 00000000..1310479b --- /dev/null +++ b/src/notebook/example.md @@ -0,0 +1,145 @@ +--- +jupyter: + jupytext: + formats: ipynb,md + text_representation: + extension: .md + format_name: markdown + format_version: '1.3' + jupytext_version: 1.16.2 + kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +# QualiCharge data: an example notebook + +This notebook aims to be an example notebook used as a starting point for a new analysis or indicator calculation. 
It provides code snippets and examples to fetch and record data from our PostgreSQL database.
+
+## Create the database engine
+
+```python
+import os
+from sqlalchemy import create_engine
+
+# Get database URL from the environment (fail fast with a KeyError if it is unset)
+database_url = os.environ["DATABASE_URL"]
+
+# Create a database engine that will be used to generate connections
+engine = create_engine(database_url)
+```
+
+## Fetch data from the database
+
+### Example 1: generate a stations map using GeoPandas
+
+```python
+from geopandas import GeoDataFrame
+
+query = """
+SELECT
+  Station.nom_station,
+  Station.id_station_itinerance,
+  Amenageur.nom_amenageur as amenageur,
+  Localisation."coordonneesXY" as geom
+FROM
+  Station
+INNER JOIN Localisation ON Station.localisation_id = Localisation.id
+INNER JOIN Amenageur ON Station.amenageur_id = Amenageur.id
+"""
+
+
+with engine.connect() as conn:
+    # Query a PostgreSQL database using the PostGIS extension
+    stations = GeoDataFrame.from_postgis(query, conn)
+
+print(f"Loaded {len(stations.index)} stations")
+stations.sample(10)
+```
+
+```python
+# Display an interactive map of the stations
+stations.explore(column="amenageur")
+```
+
+### Example 2: explore operators distribution
+
+```python
+import pandas as pd
+
+query = """
+SELECT
+  Operateur.nom_operateur,
+  PointDeCharge.id_pdc_itinerance
+FROM
+  PointDeCharge
+  INNER JOIN Station ON PointDeCharge.station_id = Station.id
+  INNER JOIN Operateur ON Station.operateur_id = Operateur.id
+"""
+
+with engine.connect() as conn:
+    # Load the query result into a plain pandas DataFrame (no geometry involved)
+    pdcs = pd.read_sql_query(query, conn)
+
+print(f"Loaded {len(pdcs.index)} points of charge")
+pdcs.sample(10)
+```
+
+```python
+import seaborn as sns
+
+# Render a barplot with the number of points of charge by operator
+sns.barplot(data=pdcs.value_counts("nom_operateur"))
+```
+
+## Write data to the database
+
+### Example 1: create a new table with calculated indicator
+
+In this example, we will calculate the 
number of points of charge per French department at a given date/time (now) and store this point-in-time snapshot in the database.
+
+```python
+import uuid
+import pandas as pd
+
+# Get the INSEE city code of each point of charge
+query = """
+SELECT
+  Localisation.code_insee_commune
+FROM
+  PointDeCharge
+  INNER JOIN Station ON PointDeCharge.station_id = Station.id
+  INNER JOIN Localisation ON Station.localisation_id = Localisation.id
+"""
+with engine.connect() as conn:
+    # Load the INSEE codes into a plain pandas DataFrame
+    codes_insee = pd.read_sql_query(query, conn)
+
+# Add a department column, kept as text: Corsica is 2A/2B and overseas codes (97x/98x) use 3 characters
+codes_insee["department"] = codes_insee["code_insee_commune"].map(lambda x: (x[:3] if x[:2] in ("97", "98") else x[:2]) if x else None)
+
+# Calculate our indicator and add a timestamp to each department counts (row)
+indicator = codes_insee.value_counts("department").to_frame().reset_index()
+indicator["calculated_at"] = pd.Timestamp.now()
+
+# Set UUIDs as the index
+indicator["uuid"] = indicator.apply(lambda _: uuid.uuid4(), axis=1)
+indicator.set_index("uuid", inplace=True)
+
+# Explicitly store the department code as text (it is not always numeric)
+indicator = indicator.astype({"department": "str"})
+indicator
+```
+
+```python
+# Save the indicator to a (new) table
+indicator.to_sql("IDepartmentDynamic", engine, if_exists="append")
+```
+
+```python
+# Check inserted results (department codes are stored as text)
+query = """SELECT * FROM "IDepartmentDynamic" WHERE department = '75'"""
+paris = pd.read_sql_query(query, engine)
+paris
+```