-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdigital_project_detector.py
131 lines (97 loc) · 5.55 KB
/
digital_project_detector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Databricks notebook source
# MAGIC %pip install openpyxl nltk
# COMMAND ----------
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, balanced_accuracy_score, roc_auc_score, recall_score
from sklearn.pipeline import make_pipeline
# COMMAND ----------
# MAGIC %md
# MAGIC ## Reading the labeled dataset
# MAGIC
# MAGIC From Clement:
# MAGIC > - For example, one could consider that the list of all Digital Development GP could be a good place to start to define “digital projects”. This is the narrowest definition we have.
# MAGIC > - Alternatively, we have used a keywords-based approach to identify digital projects (the so-called “digital tag”). This is the broadest definition.
# MAGIC > - In the middle, we have a manually collected dataset based on the Digital Economy for Africa initiative (the “DE4A tag”). This is the most accurate but it was collected only for five GPs between FY 2018 and 2023.
# MAGIC
# MAGIC Notes from exploratory analysis:
# MAGIC - Use DE4A as the ground truth since it is the most accurate of the 3 options.
# MAGIC - The DE4A tag does not seem to be geographically limited to Africa.
# MAGIC - The biggest problem with this labeled dataset is that the non-digital (0) class may contain digital projects. These are false negatives in the labeled dataset we are using as ground truth. Given the "garbage in, garbage out" principle, it might be worth going through all projects between 2018 and 2023 and manually review and correct the "DE4A tag" = 0 projects.
# COMMAND ----------
# Load the labeled spreadsheet and build a single free-text field per project
# (name + development objective + abstract) to feed the TF-IDF vectorizer.
df_raw = pd.read_excel('/Volumes/prd_dap/volumes/dap/data/DigitalDevelopmentOperations/Documents/FY14 lending investments PDF URLs - DD Lead GP, DE4A, and Digital Tags.xlsx')
project_name = df_raw['Project Name'].fillna('')
objective = df_raw['Development Objective Description'].fillna('')
abstract = df_raw['ABSTRACT_TEXT'].fillna('')
df_raw['name_objective_abstract'] = project_name + ' ' + objective + ' ' + abstract
# Restrict to FY 2018-2023: before 2018 there is no "DE4A Manual" = 0, and after
# 2023 there are DD Lead GP = 1 rows while DE4A Manual = 0:
# df[(df['DD Lead GP'] != df['DE4A Manual']) & (df['DD Lead GP'] == 1)]
in_scope = (df_raw.FY >= 2018) & (df_raw.FY <= 2023)
df = df_raw[in_scope]
df
# COMMAND ----------
# Class balance of the ground-truth label (count of non-null Project Ids per class).
by_de4a = df.groupby('DE4A Manual')['Project Id'].count()
display(by_de4a)
# COMMAND ----------
df.groupby(['DE4A Manual', 'FY']).count()['Project Id']
# COMMAND ----------
balanced_accuracy_score(df['DE4A Manual'], df['Digital Tag'])
# COMMAND ----------
# MAGIC %md
# MAGIC ## Model specification & training
# COMMAND ----------
# Model specification: TF-IDF text features feeding a strongly regularized,
# class-weighted logistic regression (labels are imbalanced).
tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
logit = LogisticRegression(solver="liblinear", class_weight='balanced', C=0.001, random_state=42)
# logit = MultinomialNB(alpha=0.1)
model = make_pipeline(tfidf, logit)

X = df['name_objective_abstract']
y = df['DE4A Manual']

# Hold out 20% of the data for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Cross-validate on the training split with balanced accuracy and sensitivity (recall).
balanced_accuracy = make_scorer(balanced_accuracy_score)
recall = make_scorer(recall_score)
cv_scores_accuracy = cross_val_score(model, X_train, y_train, cv=8, scoring=balanced_accuracy)
cv_scores_recall = cross_val_score(model, X_train, y_train, cv=8, scoring=recall)
mean_cv_accuracy = cv_scores_accuracy.mean()
print(f"Cross-validated Balanced Accuracy: {mean_cv_accuracy:.2f}",
      f"Sensitivity: {cv_scores_recall.mean():.2f}")

# Fit on the full training split, then score the held-out test set.
model.fit(X_train, y_train)
test_predictions = model.predict(X_test)
test_balanced_accuracy = balanced_accuracy_score(y_test, test_predictions)
test_recall = recall_score(y_test, test_predictions)
print(f"Test Balanced Accuracy: {test_balanced_accuracy:.2f}",
      f"Sensitivity: {test_recall:.2f}")
# COMMAND ----------
assert mean_cv_accuracy > 0.85, f"Expect {mean_cv_accuracy} to be over 85%, but got {mean_cv_accuracy}"
# COMMAND ----------
# Omitting hyperparameter tuning for a faster training pipeline
# from sklearn.model_selection import GridSearchCV
# # Define the parameter grid
# param_grid = {
# # 'tfidfvectorizer__max_features': [500, 1000],
# 'tfidfvectorizer__ngram_range': [(1, 1), (1, 2)],
# # 'logisticregression__penalty': ['l1', 'l2'],
# 'logisticregression__C': [0.001, 0.01, 0.1, 1.0],
# # 'multinomialnb__alpha': [0.1, 0.5, 1.0],
# }
# grid_search = GridSearchCV(model, param_grid, cv=8, scoring=balanced_accuracy)
# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)
# # Print the best parameters & accuracy score
# print("Best Parameters:", grid_search.best_params_)
# print("Best Balanced Accuracy:", grid_search.best_score_)
# COMMAND ----------
# Experimented with adding feature selection after TF-IDF; it didn't improve performance
# Experimented with adding preprocessing (lowercasing, punctuation handling & stemming); it also didn't improve performance
# Experimented with MultinomialNB instead of LogisticRegression; it also didn't improve performance
# pipeline = TF-IDF + a simple classifier tops out at Cross-validated Balanced Accuracy: 0.86 Sensitivity: 0.86
# COMMAND ----------
# Persist the fitted pipeline so a downstream scoring job can load it.
from pathlib import Path

import joblib

MODEL_DIR = '/Volumes/prd_dap/volumes/dap/data/DigitalDevelopmentOperations/models'
# Idempotent: succeeds whether or not the directory already exists.
Path(MODEL_DIR).mkdir(parents=True, exist_ok=True)
joblib.dump(model, f'{MODEL_DIR}/tfidf_logit.pkl', compress=1)