final submission

apiraccini · Jun 1, 2024 · e4776f2 · e4776f2
1 parent c43ff84
commit e4776f2
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 36 deletions.
diff --git a/metrics.csv b/metrics.csv
diff --git a/model.joblib b/model.joblib
diff --git a/submission.py b/submission.py
@@ -85,10 +85,6 @@ def clean_df(df, background_df=None):
 
         # Health
         'ch20m219', # gynaecologist
-
-        # Background
-        'migration_background_bg',
-        'age_bg',
     ]
 
     features_background_impute = [        
@@ -114,23 +110,13 @@ def clean_df(df, background_df=None):
         'cf20m025', # Living with partner
         'cf20m030', # Are you maried
         'cf20m402', # Same partner
-        'cf20m032', # gender partner
         'cf20m166', # How satisfied with situation as single
 
-        'cf20m471', # Children passed away
-
-        # Politics and Values 
-        'cv20l103', # Overall satisfaction
-        'cv20l125', # Marriage and children
-        'cv20l126', # One parent vs two
-        'cv20l130', # Divorce normalcy
-
         # Health
         'ch20m219', # gynaecologist
 
         # Background
         'migration_background_bg',
-        'age_bg',
         'belbezig_2020', # occupation
         'brutoink_f_2020', # gross income
         'nettoink_f_2020', # net income
@@ -160,7 +146,7 @@ def clean_df(df, background_df=None):
         'cf20m470', # Fifteenth child's birth year
         ]
 
-    # imputation
+    # imputation 
     codebook_df = pd.read_csv('PreFer_codebook.csv', low_memory=False)
     df_impute_noback = pd.merge(df[['nomem_encr']], inpute_na(df, feature_2020_impute, codebook_df, method = ''), left_index=True, right_index=True)
     df_impute_back = pd.merge(df[['nomem_encr']], inpute_na(df, features_background_impute, codebook_df), left_index=True, right_index=True)
@@ -175,6 +161,7 @@ def clean_df(df, background_df=None):
     for c in df_impute.columns:
         if c != 'nomem_encr':
             df_impute.rename(columns={c: f'{c}_imputed'}, inplace=True)
+
     df_new = pd.merge(df_impute, df[feature_2020_notimpute+['nomem_encr']],  on = 'nomem_encr', how = 'inner')
 
     feature_super_gold  =  [    
@@ -191,12 +178,11 @@ def clean_df(df, background_df=None):
         'cf11d130', # 2011
         'cf09b130', # 2009
         'cf08a130', # 2008
-        ] 
+        ]   
 
     df_zero = imputation_cf20_130(feature_super_gold, train_df=df)
     df_negative = imputation_cf20_130_negative(feature_super_gold, train_df=df)
     df_super_gold_imputed = pd.merge(df_zero, df_negative, on = 'nomem_encr', how = 'inner')
-
     df2 = pd.merge(df_super_gold_imputed, df_new, on = 'nomem_encr', how = 'inner')
 
     # process background df
@@ -217,7 +203,7 @@ def clean_df(df, background_df=None):
     df2['cf20m032_imputed'] = df2['cf20m032_imputed'].replace({1: 'male', 2: 'female'})
     df3 = pd.merge(df2, background_df_processed['gender_ds'], on = 'nomem_encr', how='left')
     df3['same_sex_ds'] = df3['cf20m032_imputed'] == df3['gender_ds']
-    df3.drop(columns=['cf20m032', 'cf20m032_imputed', 'gender_ds'], inplace=True)
+    df3.drop(columns=['cf20m032_imputed', 'gender_ds'], inplace=True)
 
     # big five
     bigfive_df = personality_bigfive(train_df=df)
@@ -234,7 +220,7 @@ def clean_df(df, background_df=None):
 
     features = df.columns.tolist()
     df = df[features]
-
+    
     return df
 
 

diff --git a/training.py b/training.py
@@ -39,6 +39,9 @@ def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluat
     features = [c for c in model_df.columns if c not in ['nomem_encr', 'new_child']]
     cat_features = [col for col in features if col.endswith('_ds')]
     num_features = [col for col in features if not col.endswith('_ds')]
+
+    for c in cat_features:
+        model_df[c] = model_df[c].astype('category')
 
     numeric_transformer = Pipeline(steps=[
         ('imputer', SimpleImputer(strategy='median')),
@@ -51,14 +54,6 @@ def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluat
             ('num', numeric_transformer, num_features),
             ('cat', categorical_transformer, cat_features)])
 
-    params_catboost = {
-        'iterations': 1000,
-        'learning_rate': 0.04465495649788828,
-        'depth': 7,
-        'subsample': 0.5990716998946282, 
-        'colsample_bylevel': 0.15856264300042117, 
-        'min_data_in_leaf': 48
-        }
     params_lgb = {
         'bagging_fraction': 0.8, 
         'feature_fraction': 0.9, 
@@ -73,18 +68,18 @@ def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluat
         }
 
     models = {
-        'et': Pipeline(steps=[('preprocessor', preprocessor), ('clf', ExtraTreesClassifier(n_estimators=500, max_features=0.3, random_state=42))]),
-        'cb': CatBoostClassifier(cat_features=cat_features, verbose=False, random_state=42, **params_catboost),
+        'cb': CatBoostClassifier(cat_features=cat_features, verbose=False, random_state=42),
+        'et': Pipeline(steps=[('preprocessor', preprocessor), ('clf', ExtraTreesClassifier(n_estimators=500, max_features=0.3, class_weight='balanced', random_state=42))]),
         'lgb':  LGBMClassifier(boosting_type = 'gbdt', random_state=42, verbose=-1, class_weight = 'balanced', **params_lgb)
     }
 
     # fit the model
-    models['et'].fit(model_df[features], model_df['new_child'])
     models['cb'].fit(model_df[features], model_df['new_child'])
+    models['et'].fit(model_df[features], model_df['new_child'])
     models['lgb'].fit(model_df[features], model_df['new_child'], categorical_feature=cat_features)
 
     # save the model and params
-    joblib.dump(models, Path(__file__).parent / f"model.joblib")
+    joblib.dump(models, f"model.joblib")
 
     if evaluate == True:
 
@@ -100,13 +95,13 @@ def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluat
             y_train, y_test = model_df.iloc[train_index]['new_child'], model_df.iloc[test_index]['new_child']
 
             # fit models
-            models['et'].fit(X_train, y_train)
             models['cb'].fit(X_train, y_train)
+            models['et'].fit(X_train, y_train)
             models['lgb'].fit(X_train, y_train, categorical_feature=cat_features)
 
             # predictions
-            et_preds = models['et'].predict_proba(X_test)[:, 1]
             cb_preds = models['cb'].predict_proba(X_test)[:, 1]
+            et_preds = models['et'].predict_proba(X_test)[:, 1]
             lgb_preds = models['lgb'].predict_proba(X_test)[:, 1]
 
             # average prediction for class 1
@@ -139,7 +134,7 @@ def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluat
             'f1_score': [f1]})
 
         # save results
-        results_df.to_csv(Path(__file__).parent / 'metrics.csv', index=False)
+        # results_df.to_csv('metrics.csv', index=False)
 
         # print cv metrics
         print('CV metrics:')