Skip to content

Commit

Permalink
final submission
Browse files: browse the repository at this point in the history
  • Loading branch information
apiraccini committed Jun 1, 2024
1 parent c43ff84 commit e4776f2
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 36 deletions.
2 changes: 0 additions & 2 deletions metrics.csv

This file was deleted.

Binary file modified model.joblib
Binary file not shown.
24 changes: 5 additions & 19 deletions submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,6 @@ def clean_df(df, background_df=None):

# Health
'ch20m219', # gynaecologist

# Background
'migration_background_bg',
'age_bg',
]

features_background_impute = [
Expand All @@ -114,23 +110,13 @@ def clean_df(df, background_df=None):
'cf20m025', # Living with partner
'cf20m030', # Are you married
'cf20m402', # Same partner
'cf20m032', # gender partner
'cf20m166', # How satisfied with situation as single

'cf20m471', # Children passed away

# Politics and Values
'cv20l103', # Overall satisfaction
'cv20l125', # Marriage and children
'cv20l126', # One parent vs two
'cv20l130', # Divorce normalcy

# Health
'ch20m219', # gynaecologist

# Background
'migration_background_bg',
'age_bg',
'belbezig_2020', # occupation
'brutoink_f_2020', # gross income
'nettoink_f_2020', # net income
Expand Down Expand Up @@ -160,7 +146,7 @@ def clean_df(df, background_df=None):
'cf20m470', # Fifteenth child's birth year
]

# imputation
# imputation
codebook_df = pd.read_csv('PreFer_codebook.csv', low_memory=False)
df_impute_noback = pd.merge(df[['nomem_encr']], inpute_na(df, feature_2020_impute, codebook_df, method = ''), left_index=True, right_index=True)
df_impute_back = pd.merge(df[['nomem_encr']], inpute_na(df, features_background_impute, codebook_df), left_index=True, right_index=True)
Expand All @@ -175,6 +161,7 @@ def clean_df(df, background_df=None):
for c in df_impute.columns:
if c != 'nomem_encr':
df_impute.rename(columns={c: f'{c}_imputed'}, inplace=True)

df_new = pd.merge(df_impute, df[feature_2020_notimpute+['nomem_encr']], on = 'nomem_encr', how = 'inner')

feature_super_gold = [
Expand All @@ -191,12 +178,11 @@ def clean_df(df, background_df=None):
'cf11d130', # 2011
'cf09b130', # 2009
'cf08a130', # 2008
]
]

df_zero = imputation_cf20_130(feature_super_gold, train_df=df)
df_negative = imputation_cf20_130_negative(feature_super_gold, train_df=df)
df_super_gold_imputed = pd.merge(df_zero, df_negative, on = 'nomem_encr', how = 'inner')

df2 = pd.merge(df_super_gold_imputed, df_new, on = 'nomem_encr', how = 'inner')

# process background df
Expand All @@ -217,7 +203,7 @@ def clean_df(df, background_df=None):
df2['cf20m032_imputed'] = df2['cf20m032_imputed'].replace({1: 'male', 2: 'female'})
df3 = pd.merge(df2, background_df_processed['gender_ds'], on = 'nomem_encr', how='left')
df3['same_sex_ds'] = df3['cf20m032_imputed'] == df3['gender_ds']
df3.drop(columns=['cf20m032', 'cf20m032_imputed', 'gender_ds'], inplace=True)
df3.drop(columns=['cf20m032_imputed', 'gender_ds'], inplace=True)

# big five
bigfive_df = personality_bigfive(train_df=df)
Expand All @@ -234,7 +220,7 @@ def clean_df(df, background_df=None):

features = df.columns.tolist()
df = df[features]

return df


Expand Down
25 changes: 10 additions & 15 deletions training.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluat
features = [c for c in model_df.columns if c not in ['nomem_encr', 'new_child']]
cat_features = [col for col in features if col.endswith('_ds')]
num_features = [col for col in features if not col.endswith('_ds')]

for c in cat_features:
model_df[c] = model_df[c].astype('category')

numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
Expand All @@ -51,14 +54,6 @@ def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluat
('num', numeric_transformer, num_features),
('cat', categorical_transformer, cat_features)])

params_catboost = {
'iterations': 1000,
'learning_rate': 0.04465495649788828,
'depth': 7,
'subsample': 0.5990716998946282,
'colsample_bylevel': 0.15856264300042117,
'min_data_in_leaf': 48
}
params_lgb = {
'bagging_fraction': 0.8,
'feature_fraction': 0.9,
Expand All @@ -73,18 +68,18 @@ def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluat
}

models = {
'et': Pipeline(steps=[('preprocessor', preprocessor), ('clf', ExtraTreesClassifier(n_estimators=500, max_features=0.3, random_state=42))]),
'cb': CatBoostClassifier(cat_features=cat_features, verbose=False, random_state=42, **params_catboost),
'cb': CatBoostClassifier(cat_features=cat_features, verbose=False, random_state=42),
'et': Pipeline(steps=[('preprocessor', preprocessor), ('clf', ExtraTreesClassifier(n_estimators=500, max_features=0.3, class_weight='balanced', random_state=42))]),
'lgb': LGBMClassifier(boosting_type = 'gbdt', random_state=42, verbose=-1, class_weight = 'balanced', **params_lgb)
}

# fit the model
models['et'].fit(model_df[features], model_df['new_child'])
models['cb'].fit(model_df[features], model_df['new_child'])
models['et'].fit(model_df[features], model_df['new_child'])
models['lgb'].fit(model_df[features], model_df['new_child'], categorical_feature=cat_features)

# save the model and params
joblib.dump(models, Path(__file__).parent / f"model.joblib")
joblib.dump(models, f"model.joblib")

if evaluate == True:

Expand All @@ -100,13 +95,13 @@ def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluat
y_train, y_test = model_df.iloc[train_index]['new_child'], model_df.iloc[test_index]['new_child']

# fit models
models['et'].fit(X_train, y_train)
models['cb'].fit(X_train, y_train)
models['et'].fit(X_train, y_train)
models['lgb'].fit(X_train, y_train, categorical_feature=cat_features)

# predictions
et_preds = models['et'].predict_proba(X_test)[:, 1]
cb_preds = models['cb'].predict_proba(X_test)[:, 1]
et_preds = models['et'].predict_proba(X_test)[:, 1]
lgb_preds = models['lgb'].predict_proba(X_test)[:, 1]

# average prediction for class 1
Expand Down Expand Up @@ -139,7 +134,7 @@ def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluat
'f1_score': [f1]})

# save results
results_df.to_csv(Path(__file__).parent / 'metrics.csv', index=False)
# results_df.to_csv('metrics.csv', index=False)

# print cv metrics
print('CV metrics:')
Expand Down

0 comments on commit e4776f2

Please sign in to comment.