diff --git a/metrics.csv b/metrics.csv index 6452a25..e79f83b 100644 --- a/metrics.csv +++ b/metrics.csv @@ -1,2 +1,2 @@ model_name,accuracy,precision,recall,f1_score -prefer,0.8997026098548941,0.7420464339581987,0.8212624584717607,0.7777125296244101 +prefer,0.9117845117845118,0.8657067273172225,0.7,0.7731426866531241 diff --git a/model.joblib b/model.joblib index e3d20b9..024069d 100644 Binary files a/model.joblib and b/model.joblib differ diff --git a/submission.py b/submission.py index f04c6d7..9a7626d 100644 --- a/submission.py +++ b/submission.py @@ -11,127 +11,234 @@ def clean_df(df, background_df=None): id_df = df[['nomem_encr']] - # add golden features - golden_features = [ - - # within how many years do you hope to have your first child 2014-2020 - 'cf20m130', # num - 'cf19l130', - 'cf18k130', - 'cf17j130', - 'cf14g130', - 'cf15h130', - 'cf16i130', - - # relationship - 'cf20m031', # num # in what year did you marry 2016-2020 - 'cf19l031', - 'cf18k031', - 'cf17j031', - 'cf20m028', # num # when did the relationship start 2020 - 'cf17j028', # num # when did the relationship start 2017 - 'cf20m029', # num # in what year did you start living together 2020 - 'cf19l029', # num # in what year did you start living together 2019 - 'cf08a178', # num # partner with previous partners? 2008 - 'cf08a179', # num # partner with previous partners? 2008 - 'cf10c166', # num # how satisfied with situation as single 2010 - 'cf11d186', # num # relationship issues due to working too much 2010 - - # children - 'cf20m129', # num # how many more children 2020 - 'cf20m456', # num # birth year first child 2020 - 'cf20m457', # num # birth year second child 2020 - 'cf20m458', # num # birth year third child 2020 - 'cf19l472', # num # children passed away 2019 - - # childcare - behaviour - 'cf20m203', # cat # getting up from bed at night 2020 - 'cf20m204', # cat # washing the child 2020 - 'cf20m202', # cat # changing diapers 2020 - 'cf20m251', # num # childcare monthly expense 2020 - - # childcare - childsitter - 'cf17j398', # num # distance from parents - 'cf20m245', # cat # childsitter yes/no 2020 - 'cf20m248', # cat # who is the childsitter 2020 - 'cf20m249', # num # how many days per week childsitter - 'cf20m387', # num # childcare supplement - 'cf20m384', # num # subsidies other than child suppleent - - # property - 'cd20m024', # num # property value 2020 - 'ca18f023', # num # car value 2018 - 'cd20m083', # num # amount mortgage remaining 2020 - - # parents - 'cf20m005', # num # year of birth father 2020 - 'cf20m009', # num # year of birth mother 2020 + feature_2020_impute = [ + + # Family & Household + 'cf20m130', # within how many years do you hope to have your first child + 'cf20m129', # How many more children + 'cf20m128', # Do you want another child ? 
-        #gynaecologist
-        'ch20m219',
+        'cf20m005', # Year of birth father
+        'cf20m008', # When did your father pass away
+        'cf20m009', # Year of birth mother
+        'cf20m012', # When did your mother pass away
+        'cf20m014', # How old were you when your parents separated
+        'cf20m398', # Distance from parents
+
+        'cf20m025', # Living with partner
+        'cf20m028', # Relationship start year
+        'cf20m029', # Living together start year
+        'cf20m030', # Are you married
+        'cf20m031', # In what year did you marry
+        'cf20m402', # Same partner
+        'cf20m032', # Gender of partner
+        'cf20m166', # How satisfied with situation as single
+        'cf20m185', # Partner disagreement frequency
+        'cf20m186', # Relationship issues due to working too much
+
+        'cf20m250', # Childcare usage
+        'cf20m251', # Childcare monthly expense
-        # question about father and mother
-        'cf20m007',
-        'cf20m008',
-        'cf20m011',
-        'cf20m012',
-        'cf20m013',
-        'cf20m014',
-        'cf20m015',
-        'cf20m016'
+        'cf20m456', # First child's birth year
+        'cf20m457', # Second child's birth year
+        'cf20m458', # Third child's birth year
+        'cf20m459', # Fourth child's birth year
+        'cf20m460', # Fifth child's birth year
+        'cf20m461', # Sixth child's birth year
+        'cf20m462', # Seventh child's birth year
+        'cf20m463', # Eighth child's birth year
+        'cf20m464', # Ninth child's birth year
+        'cf20m465', # Tenth child's birth year
+        'cf20m466', # Eleventh child's birth year
+        'cf20m467', # Twelfth child's birth year
+        'cf20m468', # Thirteenth child's birth year
+        'cf20m469', # Fourteenth child's birth year
+        'cf20m470', # Fifteenth child's birth year
+
+        'cf20m471', # Children passed away
+        'cf20m472', # Age at which children passed away
+        'cf20m486', # Household chores division
+
+        # Politics and Values
+        'cv20l068', # Political views
+        'cv20l103', # Overall satisfaction
+        'cv20l125', # Marriage and children
+        'cv20l126', # One parent vs two
+        'cv20l130', # Divorce normalcy
+
+        # Social Integration and Leisure
+        'cs20m180', # Leisure time hours
+        'cs20m370', # Education level
+
+        # Religion and Ethnicity
+        'cr20m041', # Religiosity
+        'cr20m093', # Speaking Dutch with partner
+        'cr20m094', # Speaking Dutch with children
+
+        # Economic Situation Assets
+        'ca20g023', # car value
+        # Economic Situation Housing
+        'cd20m024', # property value 2020
+        # Economic Situation Income
+        'ci20m043', # Work situation
+        'ci20m309', # Paying for children's expenses (multiple answer)
+
+        # Health
+        'ch20m219', # gynaecologist
+
+        # Background
+        'migration_background_bg',
+        'age_bg',
    ]
-    golden_features_df = df[golden_features + ['nomem_encr']]
-    # process background df
-    background_df_processed = process_background_df(background_df=background_df, train_df=df, wave_filter=201101)
+    features_background_impute = [
+        # Background
+        'belbezig_2020', # occupation
+        'brutoink_f_2020', # gross income
+        'nettoink_f_2020', # net income
+        'burgstat_2020', # civil status
+        'oplcat_2020', # education
+        'partner_2020', # lives with partner
+        'sted_2020', # urban type
+        'woning_2020', # dwelling type
+        'woonvorm_2020' # domestic situation
+    ]
+
+    feature_2020_notimpute = [
+
+        # Family & Household
+        'cf20m130', # within how many years do you hope to have your first child
+        'cf20m129', # How many more children
+        'cf20m128', # Do you want another child?
+
+        'cf20m025', # Living with partner
+        'cf20m030', # Are you married
+        'cf20m402', # Same partner
+        'cf20m032', # Gender of partner
+        'cf20m166', # How satisfied with situation as single
+
+        'cf20m471', # Children passed away
+
+        # Politics and Values
+        'cv20l103', # Overall satisfaction
+        'cv20l125', # Marriage and children
+        'cv20l126', # One parent vs two
+        'cv20l130', # Divorce normalcy
+
+        # Health
+        'ch20m219', # gynaecologist
+
+        # Background
+        'migration_background_bg',
+        'age_bg',
+        'belbezig_2020', # occupation
+        'brutoink_f_2020', # gross income
+        'nettoink_f_2020', # net income
+        'burgstat_2020', # civil status
+        'oplcat_2020', # education
+        'partner_2020', # lives with partner
+        'sted_2020', # urban type
+        'woning_2020', # dwelling type
+        'woonvorm_2020' # domestic situation
+    ]
+
+    child_birth_years = [
+        'cf20m456', # First child's birth year
+        'cf20m457', # Second child's birth year
+        'cf20m458', # Third child's birth year
+        'cf20m459', # Fourth child's birth year
+        'cf20m460', # Fifth child's birth year
+        'cf20m461', # Sixth child's birth year
+        'cf20m462', # Seventh child's birth year
+        'cf20m463', # Eighth child's birth year
+        'cf20m464', # Ninth child's birth year
+        'cf20m465', # Tenth child's birth year
+        'cf20m466', # Eleventh child's birth year
+        'cf20m467', # Twelfth child's birth year
+        'cf20m468', # Thirteenth child's birth year
+        'cf20m469', # Fourteenth child's birth year
+        'cf20m470', # Fifteenth child's birth year
+    ]
+
+    # imputation
+    codebook_df = pd.read_csv('PreFer_codebook.csv', low_memory=False)
+    df_impute_noback = pd.merge(df[['nomem_encr']], inpute_na(df, feature_2020_impute, codebook_df, method = ''), left_index=True, right_index=True)
+    df_impute_back = pd.merge(df[['nomem_encr']], inpute_na(df, features_background_impute, codebook_df), left_index=True, right_index=True)
+    df_impute = pd.merge(df_impute_back, df_impute_noback, on = ['nomem_encr'], how = 'inner')
+
+    # year last child, how many children
+    df_impute['year_last_child'] = df_impute[child_birth_years].max(axis=1, skipna=True)
+    df_impute['num_children'] = df_impute[child_birth_years].notna().sum(axis=1)
+    df_impute.drop(columns=child_birth_years, inplace=True)
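The imputation above fills each selected 2020 variable with the respondent's most recent earlier answer to the same question: inpute_na (defined further down in this file) orders the historical wave columns from newest to oldest and takes the first non-missing value per row via bfill(axis=1).iloc[:, 0]. The birth-year columns are then collapsed into year_last_child (row-wise max) and num_children (count of non-missing entries). A minimal, self-contained sketch of both ideas on toy data — the column names here are made up, not real survey variables:

    import numpy as np
    import pandas as pd

    # toy waves, ordered newest first (stand-ins for cf20*/cf19*/cf18* columns)
    waves = pd.DataFrame({
        'q_2020': [np.nan, 3.0, np.nan],
        'q_2019': [5.0, np.nan, np.nan],
        'q_2018': [7.0, 1.0, np.nan],
    })
    # most recent non-missing answer per row: backfill across columns, keep the first
    most_recent = waves.bfill(axis=1).iloc[:, 0]   # -> 5.0, 3.0, NaN

    # toy child birth years -> derived features, same row-wise logic as above
    births = pd.DataFrame({'child1': [2010, np.nan], 'child2': [2015, np.nan]})
    year_last_child = births.max(axis=1, skipna=True)   # -> 2015, NaN
    num_children = births.notna().sum(axis=1)            # -> 2, 0
    print(most_recent.tolist(), year_last_child.tolist(), num_children.tolist())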
+
+    # tag the imputed columns, then add the raw (non-imputed) 2020 features
+    for c in df_impute.columns:
+        if c != 'nomem_encr':
+            df_impute.rename(columns={c: f'{c}_imputed'}, inplace=True)
+    df_new = pd.merge(df_impute, df[feature_2020_notimpute+['nomem_encr']], on = 'nomem_encr', how = 'inner')
+
+    feature_super_gold = [
+        # within how many years do you hope to have your first child
+        'cf20m130', # 2020
+        'cf19l130', # 2019
+        'cf18k130', # 2018
+        'cf17j130', # 2017
+        'cf16i130', # 2016
+        'cf15h130', # 2015
+        'cf14g130', # 2014
+        'cf13f130', # 2013
+        'cf12e130', # 2012
+        'cf11d130', # 2011
+        'cf09b130', # 2009
+        'cf08a130', # 2008
+    ]
+
+    df_zero = imputation_cf20_130(feature_super_gold, train_df=df)
+    df_negative = imputation_cf20_130_negative(feature_super_gold, train_df=df)
+    df_super_gold_imputed = pd.merge(df_zero, df_negative, on = 'nomem_encr', how = 'inner')
+
+    df2 = pd.merge(df_super_gold_imputed, df_new, on = 'nomem_encr', how = 'inner')
+
+    # process background df
+    background_df_processed = process_background_df(background_df=background_df, train_df=df, wave_filter=201101)
+    background_gold = [
+        'actual_household_gross_monthly_income_qt',
+        'actual_household_net_monthly_income_qt',
+        'actual_household_gross_monthly_income_med_qt',
+        'actual_household_gross_monthly_income_std_qt',
+        'actual_household_net_monthly_income_std_qt',
+        'age_qt',
+        'gender_ds',
+        'got_married_fl',
+ 'actual_household_net_monthly_income_med_qt'] + background_df_processed = background_df_processed[background_gold] + + # same sex + df2['cf20m032_imputed'] = df2['cf20m032_imputed'].replace({1: 'male', 2: 'female'}) + df3 = pd.merge(df2, background_df_processed['gender_ds'], on = 'nomem_encr', how='left') + df3['same_sex_ds'] = df3['cf20m032_imputed'] == df3['gender_ds'] + df3.drop(columns=['cf20m032', 'cf20m032_imputed', 'gender_ds'], inplace=True) + + # big five bigfive_df = personality_bigfive(train_df=df) # merge preprocessed info with train data df = pd.merge(id_df, background_df_processed, on='nomem_encr', how='left') - df = pd.merge(df, golden_features_df, on='nomem_encr', how='left') + df = pd.merge(df, df3, on='nomem_encr', how='left') df = pd.merge(df, bigfive_df, on='nomem_encr', how='left') - # convert numerical features from surveys to string - list_golden_features_cat = ['cf20m245','cf20m248','cf20m202','cf20m203','cf20m204'] - df[list_golden_features_cat] = df[list_golden_features_cat].astype('str') - for f in list_golden_features_cat: - df[f] = df[f].replace('nan', 'missing') - df.rename(columns = {f : f'{f}_ds'}, inplace = True) - - # create dummy variables for selected features (important and na correlated with response) - to_dummy_na_list = [ - 'cf20m130', - 'cf20m130', - 'cf19l130', - 'cf18k130', - 'cf17j130', - 'cf14g130', - 'cf15h130', - 'cf16i130', - 'cf20m031', - 'cf19l031', - 'cf20m028', - 'cf20m029', - 'cf19l029', - 'cf20m251', - 'cf20m249' - ] - for f in to_dummy_na_list: - df[f'{f}_isna_fl'] = (df[f].isna())*1.0 - - # input missing for categorical features - features = df.columns.tolist() - - cat_features = [col for col in features if col.endswith('_ds')] + cat_features = [col for col in df.columns.tolist() if col.endswith('_ds')] df[cat_features] = df[cat_features].fillna('missing') + for c in cat_features: + df[c] = df[c].astype('category') + features = df.columns.tolist() df = df[features] return df + def predict_outcomes(df, background_df=None, model_path="model.joblib"): """Generate predictions using the saved model and the input dataframe. @@ -153,12 +260,11 @@ def predict_outcomes(df, background_df=None, model_path="model.joblib"): pd.DataFrame: A dataframe containing the identifiers and their corresponding predictions. 
""" - ## This script contains a bare minimum working example if "nomem_encr" not in df.columns: print("The identifier variable 'nomem_encr' should be in the dataset") # load the model - model = joblib.load(model_path) + models = joblib.load(model_path) # preprocess the fake / holdout data df = clean_df(df=df, background_df=background_df) @@ -167,7 +273,13 @@ def predict_outcomes(df, background_df=None, model_path="model.joblib"): vars_without_id = df.columns[df.columns != 'nomem_encr'] # generate predictions from model, should be 0 (no child) or 1 (had child) - predictions = model.predict(df[vars_without_id]) + et_preds = models['et'].predict_proba(df[vars_without_id])[:, 1] + cb_preds = models['cb'].predict_proba(df[vars_without_id])[:, 1] + lgb_preds = models['lgb'].predict_proba(df[vars_without_id])[:, 1] + + # average prediction for class 1 + final_preds = cb_preds*0.5 + et_preds*0.25 + lgb_preds*0.25 + predictions = final_preds.round() # output file should be DataFrame with two columns, nomem_encr and predictions df_predict = pd.DataFrame( @@ -345,20 +457,95 @@ def personality_bigfive(train_df): return fa_df_tomodel -def inpute_na(train_df, var_list, codebook_df): +def inpute_na(train_df, var_list, codebook_df, method='var_label'): out_df = train_df.copy() out_df = out_df[var_list] print(f'% missing for values for selected variables:\nbefore: {out_df.isna().mean(axis=1).mean():.2%}') for var_name in var_list: - - var_label = codebook_df['var_label'][codebook_df ['var_name']==var_name].values[0] - var_name_hist_codebook = codebook_df[codebook_df ['var_label']==var_label] + survey_tmp = codebook_df.survey[codebook_df.var_name == var_name].values[0] + if method=='var_label': + var_label = codebook_df['var_label'][codebook_df ['var_name']==var_name].values[0] + var_name_hist_codebook = codebook_df[codebook_df ['var_label']==var_label] + var_name_hist_codebook = var_name_hist_codebook.loc[var_name_hist_codebook.survey.str.contains(survey_tmp),:] + + else: + var_name_hist_codebook = codebook_df[codebook_df['var_name'].str.startswith(var_name[:2]) & codebook_df['var_name'].str.endswith(var_name[-3:])] + var_name_hist = var_name_hist_codebook.sort_values(by='year', ascending=False)['var_name'] - + tmp = train_df[var_name_hist] out_df[var_name] = tmp.bfill(axis=1).iloc[:, 0] print(f'after: {out_df.isna().mean(axis=1).mean():.2%}') - return out_df \ No newline at end of file + return out_df + + +def imputation_cf20_130(list_features, train_df): + + prova_train = train_df[list_features + ['nomem_encr']] + + new_cf20m130 = [] + indici_salvati = prova_train['nomem_encr'] + prova_prova_train = prova_train.fillna(-1000).drop(columns=['nomem_encr']) + valore_nan = -1000 + + for i in prova_prova_train.index: + if prova_prova_train['cf20m130'][i] == valore_nan: + count_col = 0 + for col in prova_prova_train.columns[::-1]: + count_col = count_col + 1 + if prova_prova_train[col][i] != valore_nan: + new_value = prova_prova_train[col][i] - (count_col-1) + if new_value > 0: + new_cf20m130.append([indici_salvati[i],new_value]) + else: + new_cf20m130.append([indici_salvati[i],0]) + break + if count_col == prova_prova_train.shape[1]: + new_cf20m130.append([indici_salvati[i],float('nan')]) + else: + if prova_prova_train['cf20m130'][i]>2000: + new_cf20m130.append([indici_salvati[i],prova_prova_train['cf20m130'][i]-2020]) + else: + new_cf20m130.append([indici_salvati[i],prova_prova_train['cf20m130'][i]]) + + + new_cf20m130 = pd.DataFrame(new_cf20m130, columns=['nomem_encr','cf20m130_zero']) + + return 
new_cf20m130 + + +def imputation_cf20_130_negative(list_features, train_df): + + prova_train = train_df[list_features + ['nomem_encr']] + + new_cf20m130 = [] + indici_salvati = prova_train['nomem_encr'] + prova_prova_train = prova_train.fillna(-1000).drop(columns=['nomem_encr']) + valore_nan = -1000 + + for i in prova_prova_train.index: + if prova_prova_train['cf20m130'][i] == valore_nan: + count_col = 0 + for col in prova_prova_train.columns[::-1]: + count_col = count_col + 1 + if prova_prova_train[col][i] != valore_nan: + new_value = prova_prova_train[col][i] - (count_col-1) + # if new_value > 0: + new_cf20m130.append([indici_salvati[i],new_value]) + # else: + # new_cf20m130.append([indici_salvati[i],0]) + break + if count_col == prova_prova_train.shape[1]: + new_cf20m130.append([indici_salvati[i],float('nan')]) + else: + if prova_prova_train['cf20m130'][i]>2000: + new_cf20m130.append([indici_salvati[i],prova_prova_train['cf20m130'][i]-2020]) + else: + new_cf20m130.append([indici_salvati[i],prova_prova_train['cf20m130'][i]]) + + new_cf20m130 = pd.DataFrame(new_cf20m130, columns=['nomem_encr','cf20m130_negative']) + + return new_cf20m130 \ No newline at end of file diff --git a/training.py b/training.py index 9b4a916..7ff0a74 100644 --- a/training.py +++ b/training.py @@ -8,16 +8,16 @@ from sklearn.impute import SimpleImputer from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler, OneHotEncoder -from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, cross_validate -from sklearn.linear_model import LogisticRegression -from sklearn.ensemble import RandomForestClassifier -from sklearn.ensemble import VotingClassifier, StackingClassifier +from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score +from sklearn.ensemble import ExtraTreesClassifier from catboost import CatBoostClassifier +from lightgbm import LGBMClassifier from submission import clean_df -def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluate: bool=False)-> None: +def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluate: bool=False, tune: bool=False)-> None: """ Train and tune a CatBoostClassifier model on the baseline + background features and save it. 
@@ -41,50 +41,92 @@
         ('imputer', SimpleImputer(strategy='median')),
         ('scaler', StandardScaler())])
 
-    categorical_transformer = Pipeline(steps=[
-        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
-        ('onehot', OneHotEncoder(handle_unknown='ignore'))])
+    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])
 
     preprocessor = ColumnTransformer(
         transformers=[
            ('num', numeric_transformer, num_features),
            ('cat', categorical_transformer, cat_features)])
-
-    # model = VotingClassifier(estimators=[
-    #     ('lr', Pipeline(steps=[('preprocessor', preprocessor), ('clf', LogisticRegression(class_weight='balanced', random_state=42))])),
-    #     ('rf', Pipeline(steps=[('preprocessor', preprocessor), ('clf', RandomForestClassifier(n_estimators=300, random_state=42))])),
-    #     ('cb', CatBoostClassifier(cat_features=cat_features, verbose=False, random_state=42))
-    # ], voting='hard')
-    model = StackingClassifier(
-        estimators = [
-            ('rf', Pipeline(steps=[('preprocessor', preprocessor), ('clf', RandomForestClassifier(n_estimators=300, random_state=42))])),
-            ('cb', CatBoostClassifier(cat_features=cat_features, verbose=False, random_state=42))
-        ],
-        final_estimator = LogisticRegression(class_weight='balanced', random_state=42),
-        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
-    )
+    params_catboost = {
+        'iterations': 1000,
+        'learning_rate': 0.04465495649788828,
+        'depth': 7,
+        'subsample': 0.5990716998946282,
+        'colsample_bylevel': 0.15856264300042117,
+        'min_data_in_leaf': 48
+    }
+    params_lgb = {
+        'bagging_fraction': 0.8,
+        'feature_fraction': 0.9,
+        'learning_rate': 0.1,
+        'max_bin': 20,
+        'max_depth': 30,
+        'min_data_in_leaf': 20,
+        'min_sum_hessian_in_leaf': 0.001,
+        'n_estimators': 3246,
+        'num_leaves': 24,
+        'subsample': 1.0
+    }
+
+    models = {
+        'et': Pipeline(steps=[('preprocessor', preprocessor), ('clf', ExtraTreesClassifier(n_estimators=500, max_features=0.3, random_state=42))]),
+        'cb': CatBoostClassifier(cat_features=cat_features, verbose=False, random_state=42, **params_catboost),
+        'lgb': LGBMClassifier(boosting_type = 'gbdt', random_state=42, verbose=-1, class_weight = 'balanced', **params_lgb)
+    }
+
     # fit the model
-    model.fit(model_df[features], model_df['new_child'])
+    models['et'].fit(model_df[features], model_df['new_child'])
+    models['cb'].fit(model_df[features], model_df['new_child'])
+    models['lgb'].fit(model_df[features], model_df['new_child'], categorical_feature=cat_features)
 
     # save the model and params
-    joblib.dump(model, Path(__file__).parent / f"model.joblib")
+    joblib.dump(models, Path(__file__).parent / f"model.joblib")
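The three fitted estimators are persisted as a plain dict and re-blended by hand in predict_outcomes (CatBoost 0.5, ExtraTrees 0.25, LightGBM 0.25 on the class-1 probabilities). One alternative packaging of the same weighted soft vote would be scikit-learn's VotingClassifier — this is only a sketch of that option, not what the PR does, and it reuses the models, model_df and features names defined above:

    from sklearn.ensemble import VotingClassifier

    # Hypothetical alternative: a single soft-voting estimator with the same
    # 0.25 / 0.5 / 0.25 weights could be dumped instead of the dict of models.
    # Caveat: VotingClassifier.fit accepts no per-estimator fit parameters, so
    # LightGBM's explicit categorical_feature argument could not be passed here.
    ensemble = VotingClassifier(
        estimators=[('et', models['et']), ('cb', models['cb']), ('lgb', models['lgb'])],
        voting='soft',
        weights=[0.25, 0.5, 0.25],
    )
    # ensemble.fit(model_df[features], model_df['new_child'])
    # joblib.dump(ensemble, Path(__file__).parent / "model.joblib")

Keeping the explicit dict plus manual averaging, as the PR does, sidesteps that limitation at the cost of repeating the blend weights in training.py and submission.py.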
 
     if evaluate == True:
-        print('Performing cross validation...')
-
-        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
-        scoring = ['accuracy', 'precision', 'recall', 'f1']
-
-        # perform cross validation
-        cv_results = cross_validate(model, model_df[features], model_df['new_child'], cv=cv, scoring=scoring)
-
-        # extract metrics from cv_results
-        accuracy = cv_results['test_accuracy'].mean()
-        precision = cv_results['test_precision'].mean()
-        recall = cv_results['test_recall'].mean()
-        f1 = cv_results['test_f1'].mean()
+        print('\nPerforming cross validation...')
+
+        # cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+        # cv = RepeatedStratifiedKFold(n_splits=2, n_repeats=10, random_state=42)
+        cv = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=1927)
+        cv_results = []
+
+        for train_index, test_index in tqdm(cv.split(model_df[features], model_df['new_child'])):
+            X_train, X_test = model_df.iloc[train_index][features], model_df.iloc[test_index][features]
+            y_train, y_test = model_df.iloc[train_index]['new_child'], model_df.iloc[test_index]['new_child']
+
+            # fit models
+            models['et'].fit(X_train, y_train)
+            models['cb'].fit(X_train, y_train)
+            models['lgb'].fit(X_train, y_train, categorical_feature=cat_features)
+
+            # predictions
+            et_preds = models['et'].predict_proba(X_test)[:, 1]
+            cb_preds = models['cb'].predict_proba(X_test)[:, 1]
+            lgb_preds = models['lgb'].predict_proba(X_test)[:, 1]
+
+            # average prediction for class 1
+            final_preds = cb_preds*0.5 + et_preds*0.25 + lgb_preds*0.25
+
+            # metrics
+            acc = accuracy_score(y_test, final_preds.round())
+            prec = precision_score(y_test, final_preds.round())
+            rec = recall_score(y_test, final_preds.round())
+            f1 = f1_score(y_test, final_preds.round())
+
+            cv_results.append([acc, prec, rec, f1])
+
+        # extract metrics from cv_results
+        accuracy_scores = [result[0] for result in cv_results]
+        precision_scores = [result[1] for result in cv_results]
+        recall_scores = [result[2] for result in cv_results]
+        f1_scores = [result[3] for result in cv_results]
+
+        accuracy = np.mean(accuracy_scores)
+        precision = np.mean(precision_scores)
+        recall = np.mean(recall_scores)
+        f1 = np.mean(f1_scores)
 
         results_df = pd.DataFrame({
             'model_name': ['prefer'],
@@ -98,10 +140,10 @@ def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluat
 
         # print cv metrics
         print('CV metrics:')
-        print(f"\taccuracy: {accuracy:.4f} ({cv_results['test_accuracy'].std():.4f})")
-        print(f"\tprecision: {precision:.4f} ({cv_results['test_precision'].std():.4f})")
-        print(f"\trecall: {recall:.4f} ({cv_results['test_recall'].std():.4f})")
-        print(f"\tf1 score: {f1:.4f} ({cv_results['test_f1'].std():.4f})")
+        print(f"\taccuracy: {accuracy:.4f} ({np.std(accuracy_scores):.4f})")
+        print(f"\tprecision: {precision:.4f} ({np.std(precision_scores):.4f})")
+        print(f"\trecall: {recall:.4f} ({np.std(recall_scores):.4f})")
+        print(f"\tf1 score: {f1:.4f} ({np.std(f1_scores):.4f})")
 
 
 if __name__ == '__main__':
@@ -110,7 +152,7 @@ def train_save_model(cleaned_df: pd.DataFrame, outcome_df: pd.DataFrame, evaluat
 
     parent_proj_dir = proj_dir.parent
     data_dir = parent_proj_dir / 'prefer_data'
-    evaluate = False
+    evaluate = True
 
     # import data
     print('Loading data...')
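The metrics.csv change at the top of this diff shows the trade the new ensemble makes: precision rises from about 0.74 to 0.87 while recall falls from 0.82 to 0.70. Because final_preds.round() is simply a fixed 0.5 cut-off on the blended class-1 probability, the decision threshold is the natural knob to inspect when weighing that trade-off. A small diagnostic sketch, assuming the y_test and final_preds arrays from the evaluation loop above are available (for example, from the last split); it is not part of the pipeline:

    import numpy as np
    from sklearn.metrics import precision_recall_curve

    # precision/recall at every candidate threshold on the blended probability
    precision, recall, thresholds = precision_recall_curve(y_test, final_preds)
    f1 = 2 * precision * recall / np.clip(precision + recall, 1e-12, None)
    best = np.argmax(f1[:-1])   # the last precision/recall pair has no threshold
    print(f"best F1 {f1[best]:.3f} at threshold {thresholds[best]:.2f}")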