Skip to content

Commit

Permalink
perf: optimization of Airflow tasks with parallelism and PostgreSQL
Browse files Browse the repository at this point in the history
  • Loading branch information
Francesco Stablum committed Nov 10, 2021
1 parent 20193de commit 97b6e4c
Show file tree
Hide file tree
Showing 10 changed files with 42 additions and 5 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
config/
config/*.yaml
!config/example.yaml

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
2 changes: 1 addition & 1 deletion airflow/airflow.cfg.m4
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ executor = LocalExecutor
# SqlAlchemy supports many different database engines.
# More information here:
# http://airflow.apache.org/docs/apache-airflow/stable/howto/set-up-database.html#database-uri
sql_alchemy_conn = postgresql+psycopg2://scot:tiger@localhost:5432/airflow
sql_alchemy_conn = postgresql+psycopg2://airflow_user:PG_PASSWORD@localhost:5432/airflow_db

# The encoding for the databases
sql_engine_encoding = utf-8
Expand Down
19 changes: 17 additions & 2 deletions airflow/install_local_airflow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,26 @@

# Absolute path of the directory containing this script, independent of the
# caller's current working directory.
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
# Parent directory of the script directory (the learning_sets checkout).
# Quoted so that a path containing whitespace is not word-split.
LEARNING_SETS_DIR="$(readlink -f "$SCRIPT_DIR/..")"

# Helper that prints the value of a single configuration item on stdout.
GET_CONF_ITEM="$LEARNING_SETS_DIR/config/get_conf_item.sh"

# FIXME: automate conf item extraction?
PG_PASSWORD="$(bash "$GET_CONF_ITEM" pg_password)"
AIRFLOW_USER="$(bash "$GET_CONF_ITEM" airflow_user)"
AIRFLOW_PASSWORD="$(bash "$GET_CONF_ITEM" airflow_password)"
AIRFLOW_EMAIL="$(bash "$GET_CONF_ITEM" airflow_email)"

mkdir -pv $HOME/airflow
mkdir -pv $HOME/airflow/dags

# Expand the m4 macros in airflow.cfg.m4 (HOME becomes the user's home directory) and write the result to $HOME/airflow/airflow.cfg
m4 -DHOME=$HOME airflow.cfg.m4 > $HOME/airflow/airflow.cfg
# Generate the Airflow configuration from its m4 template.
# - The macro values are quoted so that a home directory or password containing
#   whitespace or glob characters reaches m4 as a single argument.
# - The template is addressed through SCRIPT_DIR so the script also works when
#   it is launched from a directory other than its own.
m4 -DHOME="$HOME" \
    -DPG_PASSWORD="$PG_PASSWORD" \
    "$SCRIPT_DIR/airflow.cfg.m4" > "$HOME/airflow/airflow.cfg"

# Substitute LEARNING_SETS_DIR (derived from this script's path) into the module that adds learning_sets' DAGs, and install it in Airflow's dags folder
m4 -DLEARNING_SETS_DIR=$LEARNING_SETS_DIR add_dag_bags.py.m4 > $HOME/airflow/dags/add_dag_bags.py
# Generate the DAG-loading module from its m4 template.
# The macro value and both paths are quoted (safe with whitespace), and the
# template is addressed through SCRIPT_DIR so the caller's cwd does not matter.
m4 -DLEARNING_SETS_DIR="$LEARNING_SETS_DIR" \
    "$SCRIPT_DIR/add_dag_bags.py.m4" > "$HOME/airflow/dags/add_dag_bags.py"

# Initialise the Airflow metadata database configured in airflow.cfg.
airflow db init

# Create the Admin account for the Airflow UI. The configured user name is
# reused as the first name and "X" is a placeholder last name.
# Every expansion is quoted so values with whitespace or glob characters are
# passed as single arguments.
airflow users create \
    --username "$AIRFLOW_USER" \
    --email "$AIRFLOW_EMAIL" \
    --role Admin \
    --firstname "$AIRFLOW_USER" \
    --lastname X \
    --password "$AIRFLOW_PASSWORD"

3 changes: 3 additions & 0 deletions airflow/psql_commands.m4
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-- Statements that create the PostgreSQL database and user named in the Airflow connection string.
-- NOTE(review): the password placeholder below is not the one that airflow.cfg.m4
-- substitutes into sql_alchemy_conn for this same user; confirm both receive the same value.
CREATE DATABASE airflow_db;
CREATE USER airflow_user WITH PASSWORD 'AIRFLOW_PASSWORD';
GRANT ALL PRIVILEGES ON DATABASE airflow_db TO airflow_user;
2 changes: 2 additions & 0 deletions apt_packages.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
libpq-dev
postgresql
9 changes: 9 additions & 0 deletions config/example.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# suggestion on how to generate a password:
# tr -dc A-Za-z0-9 </dev/urandom | head -c 13 ; echo ''
# https://unix.stackexchange.com/questions/230673/how-to-generate-a-random-string

mongo_user: someuser
mongo_password: somepassword
mongo_host: somehost
Expand All @@ -6,3 +10,8 @@ mongo_db: learning_sets

vm_user: someuser
vm_host: somehost

pg_password: somepassword
airflow_user: someuser
airflow_password: somepassword
airflow_email: someuser@example.com
6 changes: 6 additions & 0 deletions config/get_conf_item.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Print the value of one configuration item on stdout.
#
# Usage: get_conf_item.sh <item_name>
#   <item_name>  attribute of the `config` object exported by the `common`
#                package (for example: pg_password).
#
# NOTE: <item_name> is interpolated into Python source code, so it must only
# come from trusted callers.

# Make a failure of the python process visible in the exit status instead of
# being hidden behind tail's status.
set -o pipefail

if [ "$#" -lt 1 ] || [ -z "$1" ]; then
    echo "usage: $(basename -- "${BASH_SOURCE[0]}") <item_name>" >&2
    exit 2
fi

SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
LEARNING_SETS_DIR="$(readlink -f "$SCRIPT_DIR/..")"
# Run from the parent of this script's directory; presumably that is where the
# `common` package lives so it can be imported - TODO confirm.
cd "$LEARNING_SETS_DIR" || exit 1

# Only the last output line is kept; presumably this drops anything printed
# while `common` is being imported - TODO confirm.
python3 -c "from common import config ; print(config.${1})" | tail -n1
File renamed without changes.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ gensim
nltk
airflow
torchvision
psycopg2
2 changes: 1 addition & 1 deletion rsync.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash
VM_URI=$(bash vm_uri.sh)
# Destination printed by config/vm_uri.sh (presumably user@host - TODO confirm).
VM_URI="$(bash config/vm_uri.sh)"
bash mongo_dump.sh

# Refuse to call rsync with an empty destination host.
if [ -z "$VM_URI" ]; then
    echo "rsync.sh: config/vm_uri.sh returned an empty VM URI" >&2
    exit 1
fi

# Copy the current directory (except mlruns) to ~/learning_sets/ on the VM.
# The destination is quoted so the URI is passed as a single argument; the
# tilde is left for the remote side to expand.
rsync -ruv --exclude mlruns . "${VM_URI}:~/learning_sets/"

0 comments on commit 97b6e4c

Please sign in to comment.