-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_pipeline.py
73 lines (60 loc) · 2.21 KB
/
data_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import json
import requests
import os
from datetime import datetime, timedelta
import airflow
from airflow import DAG
from airflow.operators.python import PythonOperator
from elasticsearch import Elasticsearch, helpers
# load tasks
from dag_tasks.fetch_tweets import fetch_tweets
from dag_tasks.inference_tweets import inference_tweets
from dag_tasks.insert_sentiments_postgresql import insert_sentiments_postgresql
from dag_tasks.delete_json_tweets import delete_json_files
# PARAMETERS
# Directory layout: everything is resolved relative to this DAG file.
PATH_TO_DIR = os.path.dirname(os.path.abspath(__file__))
PATH_TO_DATASET_DIR = os.path.join(PATH_TO_DIR, "datasets")

# Arguments for task: fetch_tweets
path_to_data_source = os.path.join(
    PATH_TO_DATASET_DIR, "combined", "unlabelled", "combined_unlabelled.csv"
)
search_parameters = dict(
    company_request_id='ZDFSDF',
    keyword='iphone',
    n_tweets=30,
)
path_to_dataset_dir = PATH_TO_DATASET_DIR

# Arguments for task: inference_tweets
path_to_model = os.path.join(PATH_TO_DIR, "model", "sentiment_analysis")

# Arguments for task: insert_sentiments_postgresql
# Kept in sync with search_parameters so both tasks tag the same request.
company_request_id = search_parameters['company_request_id']
# DAG
# Pipeline: fetch raw tweets -> run sentiment inference -> persist results
# to PostgreSQL -> delete the intermediate JSON files. Runs every 5 minutes.
with DAG(
    dag_id='twitter_data_pipeline',
    # NOTE(review): airflow.utils.dates.days_ago() is deprecated in Airflow 2.x;
    # prefer a static, timezone-aware datetime once the Airflow version is confirmed.
    start_date=airflow.utils.dates.days_ago(1),
    schedule_interval=timedelta(minutes=5),
    catchup=False  # do not back-fill runs missed while the scheduler was down
) as dag:
    # Tasks instantiated inside the `with DAG(...)` context are attached to the
    # DAG automatically, so the redundant `dag=dag` kwargs have been removed.

    # Download tweets matching `search_parameters` into the dataset directory.
    fetch_tweets_task = PythonOperator(
        task_id='fetch_tweets',
        python_callable=fetch_tweets,
        op_args=[path_to_data_source, search_parameters, path_to_dataset_dir],
    )
    # Score the fetched tweets with the sentiment-analysis model.
    inference_tweets_task = PythonOperator(
        task_id='inference_tweets',
        python_callable=inference_tweets,
        op_args=[path_to_model, path_to_dataset_dir],
    )
    # Persist the scored sentiments, tagged with the originating request id.
    insert_sentiments_postgresql_task = PythonOperator(
        task_id='insert_sentiments_postgresql',
        python_callable=insert_sentiments_postgresql,
        op_args=[PATH_TO_DIR, PATH_TO_DATASET_DIR, company_request_id],
    )
    # Clean up the intermediate JSON files produced by the earlier tasks.
    delete_json_files_task = PythonOperator(
        task_id='delete_json_files',
        python_callable=delete_json_files,
        op_args=[PATH_TO_DATASET_DIR],
    )
    # Linear dependency chain: fetch -> inference -> insert -> cleanup.
    fetch_tweets_task >> inference_tweets_task >> insert_sentiments_postgresql_task >> delete_json_files_task