########################################################################################################################
# HPC PARALLELIZATION SCRIPT WITH IPYPARALLEL BACKEND ##################################################################
# REMOVING HIGHLY CORRELATED FEATURES, RESAMPLING, FEATURE TRANSFORMATION, PARAMETER GRID SEARCH, DATA SPLIT BY GENDER #
# Jeff DIDIER - Faculty of Science, Technology and Medicine (FSTM), Department of Life Sciences and Medicine (DLSM) ####
# November 2021 - March 2023, University of Luxembourg, v.03/16/2023 (M/d/y) ###########################################
########################################################################################################################
# SUMMARY: Full clinical cohort data as well as split data based on gender, updated and revised functions and comments,
# split pipeline and grid search, adapted prints, save figures path, performance summary for all 3 data type cases,
# removing constant and near-constant features, feature importance, removing highly correlated features, removing
# features used for engineering, added visualizations for highly correlated features and feature importance evaluation,
# select subgroups, transformed everything into functions, added a configuration script, enable and disable several
# steps, high reproducibility
# /!\ TO CONSIDER DEPENDING ON WHETHER RUNNING ON HPC OR LOCAL MACHINES: /!\ #
# ------------------------------------------------------------------ #
# First have a look in the script configuration options section of part 1 and adapt if needed!
# Regarding HPC: verbose=0 | False, n_jobs=number of ip-engines, cache_size=200, exhaustive grid search intervals
# Regarding local machines: verbose=[1, 2] or True, n_jobs=-1, cache_size=2000, reduced grid search intervals
# Set parallel_method to ipyparallel to enable HPC client, and threading or multiprocess for local machines
# ON HPC: Check available modules, create python environment, install requirements, create directories, import data e.g.
# ./data/train_imputed.csv and ./data/test_imputed.csv, sync script, config, utils and launcher files.
# Run script on HPC using 'sbatch HPC_SVM_launcher.sh CBD_pipeline_SVM_HPC.py' after the configurations in
# CBDP_config.py are set to your needs.
# REQUIRED FILES: CBD_pipeline_SVM_HPC.py, CBDP_utils.py, CBDP_config.py, HPC_SVM_launcher.sh, requirements.txt;
# ./env/eli5/permutation_importance.py, ./env/mlxtend/evaluate/feature_importance_permutation.py,
# ./env/mlxtend/evaluate/__init__.py adapted for parallelization and ./env/imblearn/base.py for updated SMOTENC,
# shuffle_me.py for explanation of shuffle numbers. All necessary files can be found here:
# https://github.com/sysbiolux/Clinical_Biomarker_Detection/tree/main/source
# Global session is saved for each kernel in '-global-save.pkl' file, the main script execution output is collected
# in the generated log file if running on HPC. Location: log/job_ID/code_jobID_execution.out
# /!\ CURRENT WARNINGS / ERRORS ENCOUNTERED: /!\ #
# ---------------------------------------------- #
# OutdatedPackageWarning: The package outdated is out of date. Your version is 0.2.1, the latest is 0.2.2.
# Set the environment variable OUTDATED_IGNORE=1 to disable these warnings. SUPPRESSED, MINOR CHANGES NOT WORTH UPDATING
# OutdatedPackageWarning: The package pingouin is out of date. Your version is 0.5.0, the latest is 0.5.3.
# Set the environment variable OUTDATED_IGNORE=1 to disable these warnings. SUPPRESSED, MINOR CHANGES NOT WORTH UPDATING
# ConvergenceWarning: Solver terminated early (max_iter=150000). Consider pre-processing your data with StandardScaler
# or MinMaxScaler. IGNORED, HAPPENS WHEN USING ROBUST SCALER OR MAX ITER OF THE CLASSIFIER IS REACHED (SEE CONFIG)
# RuntimeWarning: invalid value encountered in true_divide c_matrix = c_matrix.astype('float') / c_matrix.sum(axis=1)
# [:, np.newaxis]. IGNORED, APPEARS WHEN CONFUSION MATRICES OF TAGGED FEATURES ARE DRAWN WITH ONLY POSITIVE TARGETS
########################################################################################################################
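# Illustrative configuration sketch (comments only, not executed): one hedged way the HPC-versus-local advice above
# could look in CBDP_config.py. All concrete values are hypothetical placeholders, adapt them to your setup.
#   HPC:   parallel_method = 'ipyparallel'; grid_verbose = 0; clf_verbose = False; cache_size = 200
#          n_jobs is overwritten below with the number of ip-engines; use exhaustive grid search intervals
#   Local: parallel_method = 'threading'  # or 'multiprocess'
#          grid_verbose = 2; clf_verbose = True; n_jobs = -1; cache_size = 2000; reduced grid search intervals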
# ## PART 0: SCRIPT START IMPORTING LIBRARIES ##########################################################################
########################################################################################################################
################################################
# ## Importing libraries and configuration file
################################################
# Libraries
import argparse
import logging
import math
import matplotlib
import random
import sys
import warnings
import pandas as pd
from eli5.permutation_importance import get_score_importances
from imblearn.over_sampling import SMOTE, SMOTEN, SMOTENC
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from ipyparallel import Client
from ipyparallel.joblib import IPythonParallelBackend
from joblib import cpu_count, register_parallel_backend
from mlxtend.evaluate import feature_importance_permutation
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import KernelPCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
# Control output printing destination (original = console)
orig_stdout = sys.__stdout__
# Starting the script and try to load the dependent files
print(f"\n################################################\n################################################")
# Pipeline logo
logo = '\n _________________ ________ ______\n'\
' / ______/ ___ \\/ ____ \\ / ___ \\\n'\
'/ / | /__/ / / | | ___ / /__/ /\n'\
'| | / ___ </ / / / /__// _____/\n'\
'\\ \\____/ /___\\ \\ /____/ / / /\n'\
' \\_____/__________/________/ /__/ v.03/16/2023 (M/d/y)\n'\
'---=====================================---\n'\
' CLINICAL BIOMARKER DETECTION - PIPELINE\n\n'
print(logo)
print(f"For the documentation see the link below:\n"
f"https://github.com/sysbiolux/Clinical_Biomarker_Detection#readme\n\n"
f"Starting the Clinical Biomarker Detection Pipeline v.03/16/2023.\n\n")
# Loading the CBD-P utils file
print(f"******************************************\nLOADING DEPENDENT FILES:\n\nLoading the CBD-P utils file ...")
try:
from source.CBDP_utils import *
# import all CBDP related functions
print("CBD-P utils file loaded successfully!\n")
except ImportError:
    sys.exit('CBD-P utils file could not be found or loaded correctly.')
# Loading the CBD-P configuration file
print(f"Loading the CBD-P configuration file ...")
try:
from CBDP_config import *
# import the related configurations
print("CBD-P configuration file loaded successfully!\n")
except ImportError:
    sys.exit('CBD-P configuration file could not be found or loaded correctly.')
########################################################################################################################
# ## PART 1: DOUBLE CHECK IF CONFIGURATION VARIABLES ARE LEGAL #########################################################
########################################################################################################################
###################################################################################
# ## Safety measures for disabled parameters and total parameter dictionary update
###################################################################################
# TODO CLEAN SAFETY RESETS AND VARIABLE CHECKS
# Safety measure to surely reset removing highly correlated feature parameters if disabled, else restore
if not enable_rhcf:
thresh_cramer, thresh_spearman, thresh_pbs = [None] * 3
else:
thresh_cramer, thresh_spearman, thresh_pbs = thresh_cramer, thresh_spearman, thresh_pbs
# Safety measure for PCA versus LDA (currently only one can be selected)
if pca_tech in ('normal_pca', 'kernel_pca') and da_tech == 'lda' and enable_ft:
# prioritize pca_tech over da_tech if both are given (currently only one allowed)
pca_tech = pca_tech
da_tech = ''
elif da_tech == 'lda' and enable_ft:
pca_tech = ''
da_tech = da_tech
elif da_tech == '' and pca_tech in ('normal_pca', 'kernel_pca') and enable_ft:
pca_tech = pca_tech
da_tech = da_tech
else:
pca_tech = ''
da_tech = ''
print(f'**No continuous feature transformation technique selected or feature transformation is completely '
f'disabled.\nStatus of feature transformation step: {enable_ft}**')
# Safety measure for kbest technique, reset to chi2 if not among recommendations or callable function
if kbest_tech not in ('chi2', '', 'cramer'):
print('**Kbest select technique not among the recommended settings. '
'Check if a callable score function was given.**')
if not hasattr(kbest_tech, '__call__'):
kbest_tech = 'chi2'
        print('**No callable score function given. K best is reset to chi2. Check config if this is not desired.**')
# Safety measure to destroy kbest_tech if ft is not enabled
if enable_ft:
kbest_tech = kbest_tech
else:
kbest_tech = ''
print(f'**No categorical feature transformation technique selected or feature transformation is completely '
f'disabled.\nStatus of feature transformation step: {enable_ft}**')
# Safety measure to surely reset feature transformation parameters and dictionary if disabled, else restore
if not (enable_ft and pca_tech == 'normal_pca'):
pca_lpsr = [None]
# Update dictionary depending on enabled pipeline steps
total_params_and_splits.update({'pca_lpsr': pca_lpsr})
else:
pca_lpsr = pca_lpsr
if not (enable_ft and kbest_tech != ''):
k_best_lpsr = [None]
total_params_and_splits.update({'k_best_lpsr': k_best_lpsr})
else:
k_best_lpsr = k_best_lpsr
# Safety measure to surely reset resampling technique if disabled, else restore
if not enable_resampling:
resampling_tech = ''
else:
resampling_tech = resampling_tech
# Safety measure to surely reset resampling parameters and dictionary if disabled, else restore
if resampling_tech != 'smote':
k_neighbors_smote_lpsr = [None]
# Update dictionary depending on enabled pipeline steps
total_params_and_splits.update({'k_neighbors_smote_lpsr': k_neighbors_smote_lpsr})
else:
k_neighbors_smote_lpsr = k_neighbors_smote_lpsr
# Safety measure to surely reset pipeline order if resampling and feature transformation are disabled
if not (enable_ft or enable_resampling):
pipeline_order = 'FT and resampling disabled, only standardization applies'
else:
pipeline_order = pipeline_order
# Safety measure to surely reset engineered input prefix if disabled
if not enable_engineered_input_removal:
engineered_input_prefix = ''
else:
engineered_input_prefix = engineered_input_prefix
# Safety measure to surely reset pca tech if disabled
if not enable_ft:
pca_tech = ''
    pca_kernel_dict.update({key: [None] for key in pca_kernel_dict})
pca_lpsr = [None]
else:
pca_tech, pca_lpsr = pca_tech, pca_lpsr
# Update kernel pca dictionary to total params and splits if kernel pca activated and reset SVM kernels to linear
if pca_tech == 'kernel_pca':
    kernels = ['linear']  # Change possible SVM kernels to linear if kPCA is on, so only linear SVC parameters are loaded
non_linear_kernels = [None]
gamma_psr, coef0_ps, degree_p = [[None]] * 3
total_params_and_splits.update({'gamma_psr': gamma_psr,
'coef0_ps': coef0_ps,
'degree_p': degree_p})
if 'poly' in kernel_pca_kernel_lpsr:
total_params_and_splits.update(pca_kernel_dict)
elif 'rbf' in kernel_pca_kernel_lpsr:
total_params_and_splits.update(
{k: v for k, v in pca_kernel_dict.items() if k not in ('kpca_coef0_lpsr', 'kpca_degree_lpsr')})
elif 'sigmoid' in kernel_pca_kernel_lpsr:
total_params_and_splits.update(
{k: v for k, v in pca_kernel_dict.items() if k not in ['kpca_degree_lpsr']})
else:
kernels = kernels
non_linear_kernels = non_linear_kernels
gamma_psr, coef0_ps, degree_p = gamma_psr, coef0_ps, degree_p
# Be sure that scaler tech is set, cannot be empty, thus reset to default if not set
if scaler_tech not in ('standard', 'minmax', 'robust'):
scaler_tech = 'standard'
warnings.warn("**Scaler technique was not set in the configuration file. Default 'standard' is loaded.**")
# If da_tech is set to 'lda', update total_params_and_splits
if enable_ft and da_tech == 'lda' and pca_tech == '':
total_params_and_splits.update(lda_dict)
# Reset splitting feature if disabled
if not enable_data_split:
split_feature = ''
else:
split_feature = split_feature
# Reset subgroups to keep if disabled
if not enable_subgroups:
subgroups_to_keep = 'all'
else:
subgroups_to_keep = subgroups_to_keep
# Reset scorer if not among implemented possibilities
if scorer not in ('F.5', 'F1', 'F2', 'F5', 'roc_auc', 'accuracy', 'balanced_accuracy', 'matthews_corrcoef', 'dor'):
scorer = 'accuracy'
warnings.warn("**Scorer was not among the possible scores. Default 'accuracy' is loaded.**")
# Reset feature importance method settings
if not enable_feature_importance:
feature_importance_method = ''
enable_box_bar_plots = False
elif feature_importance_method not in ('sklearn', 'mlxtend', 'eli5', 'all'):
feature_importance_method = 'all'
warnings.warn("**Feature importance method not set correctly. Default 'all' is loaded.**")
else:
enable_box_bar_plots, feature_importance_method = enable_box_bar_plots, feature_importance_method
# Reset box and bar plot settings dependent on the feature importance
if not enable_box_bar_plots:
box_bar_figures = ''
elif box_bar_figures not in ('separated', 'combined'):
box_bar_figures = 'combined'
warnings.warn("**Plot setting for box and bar plots are not set correctly. Default 'combined' is loaded.**")
else:
box_bar_figures = box_bar_figures
# check if target feature and positive/negative classes are given
for string in (output_feature, positive_class, negative_class):
if len(string) == 0:
raise TypeError("**One or more of the following information is missing in the configuration file to start the "
"pipeline: output_feature, positive_class, or negative_class. Got %s instead.**" % string)
# check figure format
if figure_format not in ('png', 'tif', 'tiff', 'jpg', 'jpeg'):
figure_format = 'png'
warnings.warn("**Figure format setting not among the suggested ones. Default 'png' is loaded.**")
# check if kernels and non-linear kernels are properly defined
possible_non_linear_svm_kernels = ['poly', 'rbf', 'sigmoid']
tmp = []
for kern in kernels:
if kern in possible_non_linear_svm_kernels:
tmp.append(kern)
if set(non_linear_kernels) != set(tmp):
non_linear_kernels = tmp if len(tmp) > 0 else [None]
else:
non_linear_kernels = non_linear_kernels
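# Illustrative example of the reconciliation above (hypothetical values): kernels = ['linear', 'rbf'] forces
# non_linear_kernels to ['rbf'], while kernels = ['linear'] alone resets non_linear_kernels to [None].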
if drop_or_pass_non_treated_features not in ('drop', 'passthrough'):
drop_or_pass_non_treated_features = 'drop'
warnings.warn("**Decision to drop or passthrough features in column-transformer that are not transformed is not "
"valid. Default 'drop' is loaded.**")
##################################
# ## Configuration variable check
##################################
# Variables check that should strictly be an integer
config_int = [seed, fix_font, imp_font, fig_max_open_warning, pandas_col_display_option, cache_size, grid_verbose,
hard_iter_cap, splits, shuffle_all, shuffle_male, shuffle_female, n_jobs, tiff_figure_dpi]
if not all(isinstance(i, int) for i in config_int):
raise TypeError('The following configured variables must be integers: seed, fix_font, imp_font, '
'fig_max_open_warning, pandas_col_display_option, cache_size, grid_verbose, hard_iter_cap, splits, '
'shuffle_all, shuffle_male, shuffle_female, n_jobs, tiff_figure_dpi. Got %s instead.' % config_int)
# Variables check that should strictly be floating values
config_float = [thresh_near_constant]
if not all(isinstance(i, float) for i in config_float):
raise TypeError('The following configured variables must be float: thresh_near_constant. Got %s instead.'
% config_int)
# Variables check that should strictly be a directory or path
if not os.path.isdir(curr_dir):
raise IOError('The current directory is not set or recognized as such. Got current directory: %s.' % curr_dir)
if not all(os.path.isfile(i) for i in [train_path, test_path]):
raise FileNotFoundError("The given train and test set pathways are not set or can't be found. "
"Got train: %s and test: %s." % (train_path, test_path))
# Variables check that should strictly be a string
config_str = [plot_style, pipeline_order, output_feature, split_feature, decision_func_shape, parallel_method,
resampling_tech, folder_prefix, pca_tech, da_tech, scaler_tech, scorer, feature_importance_method,
box_bar_figures, negative_class, positive_class, kbest_tech, drop_or_pass_non_treated_features,
figure_format]
if not (all(isinstance(i, str) for i in config_str)):
if not hasattr(kbest_tech, '__call__'):
        raise TypeError('The following configured variables must be single strings: plot_style, pipeline_order, '
                        'output_feature, split_feature, decision_func_shape, parallel_method, resampling_tech, '
                        'folder_prefix, pca_tech, da_tech, scaler_tech, scorer, feature_importance_method, '
                        'box_bar_figures, negative_class, positive_class, kbest_tech, '
                        'drop_or_pass_non_treated_features, figure_format. Got %s instead.' % config_str)
# Variables check that should strictly be a list of strings or str
if not (all(isinstance(i, str) for i in output_related) or isinstance(output_related, list)):
raise TypeError('One or multiple of the configured output features were not recognized as str or list of str: '
'output_related. Got %s.' % output_related)
kernel_info = [kernels, non_linear_kernels, kernel_pca_kernel_lpsr]
if not (all(isinstance(i, list) for i in kernel_info) or all(isinstance(i, str) for i in kernel_info)):
raise TypeError('One or multiple of the following configured kernel list information are not strings: kernels, '
'non_linear_kernels, kernel_pca_kernel_lpsr. Got %s.' % kernel_info)
# Variables check that should strictly be a boolean
config_bool = [enable_rhcf, enable_resampling, enable_ft, clf_verbose, additional_params, enable_feature_importance,
enable_engineered_input_removal, enable_data_split, enable_subgroups, enable_box_bar_plots,
linear_shuffle]
if not all(isinstance(i, bool) for i in config_bool):
raise TypeError('The following configured variables must be boolean: enable_rhcf, enable_resampling, '
'enable_ft, clf_verbose, additional_params, enable_feature_importance, '
'enable_engineered_input_removal, enable_data_split, enable_subgroups, enable_box_bar_plots, '
'linear_shuffle. '
'Got %s instead.' % config_bool)
# Variables that could be str or tuple of str
config_tuples = [engineered_input_prefix, subgroups_to_keep, tag_threshold]
for i in config_tuples:
if not (isinstance(i, tuple) or all(isinstance(k, str) for k in i)):
raise TypeError('The following configured variable must be a tuple or str: engineered_input_prefix, '
'subgroups_to_keep, tag_threshold. Got %s instead.' % i)
# RHCF threshold variables check if remove highly correlated features is enabled
if enable_rhcf:
config_thresh_tuple = [thresh_cramer, thresh_spearman, thresh_pbs]
for i in config_thresh_tuple:
if not isinstance(i, tuple):
raise TypeError('The following configured variables must be tuples: '
'thresh_cramer, thresh_spearman, thresh_pbs. '
'Got %s instead.' % config_thresh_tuple)
if isinstance(i[1], str) and i[1] == 'decimal':
if not isinstance(i[0], float):
raise TypeError('The following threshold variable for decimal cut-off must be float. Got %s '
'instead.' % i)
elif isinstance(i[1], str) and i[1] == 'percentile':
if not isinstance(i[0], int):
raise TypeError('The following threshold variable for percentile cut-off must be int. Got %s '
'instead.' % i)
elif i[1] not in ['decimal', 'percentile']:
raise TypeError("One of the following threshold variable specifications is missing in the configured "
"variable: 'decimal' or 'percentile'. Got %s." % i)
# Check for the kernel params and split dict
config_dict = [additional_technique_params, additional_kernel_params, total_params_and_splits, pca_kernel_dict,
lda_dict]
if not all(isinstance(i, dict) for i in config_dict) or not len(total_params_and_splits) > 0:
    raise TypeError('The following configured variables must be dictionaries (and total_params_and_splits must have '
                    'non-zero length): additional_technique_params, additional_kernel_params, '
                    'total_params_and_splits, pca_kernel_dict, lda_dict. Got %s instead.' % config_dict)
# Check if all grid search parameters are legal
# Pipeline parameters
grid_search_clf_params = [regularization_lpsr, shrinking_lpsr, tolerance_lpsr, gamma_psr, degree_p, coef0_ps]
if not all(isinstance(i, list) for i in grid_search_clf_params):
raise TypeError('The following configured variables must be lists of values or strings: regularization_lpsr, '
'shrinking_lpsr, tolerance_lpsr, gamma_psr, degree_p, coef0_ps. Got %s.' % grid_search_clf_params)
# Resampling parameters
if enable_resampling:
grid_search_samples_params = [k_neighbors_smote_lpsr]
if not all(isinstance(i, list) for i in grid_search_samples_params):
raise TypeError('The following configured variable must be lists of values or strings: '
'k_neighbors_smote_lpsr. Got %s.' % grid_search_samples_params)
# Feature transformation parameters
if enable_ft:
grid_search_features_params = [pca_lpsr, k_best_lpsr, kernel_pca_lpsr, kernel_pca_gamma_lpsr,
kernel_pca_tol_lpsr, kernel_pca_degree_lpsr, kernel_pca_coef0_lpsr,
lda_shrinkage_lpsr, lda_priors_lpsr, lda_components_lpsr, lda_tol_lpsr]
if not all(isinstance(i, list) for i in grid_search_features_params):
raise TypeError('The following configured variables must be lists of values or strings: '
'pca_lpsr, k_best_lpsr, kernel_pca_lpsr, kernel_pca_gamma_lpsr, kernel_pca_tol_lpsr, '
'kernel_pca_degree_lpsr, kernel_pca_coef0_lpsr, lda_shrinkage_lpsr, lda_priors_lpsr, '
'lda_components_lpsr, lda_tol_lpsr. Got %s' % grid_search_features_params)
########################################################################################################################
# ## PART 2: SETTING UP THE RESULTS FOLDER, AND HPC OPERABILITY ########################################################
########################################################################################################################
###################################################
# ## Creating the results folder and clear content
###################################################
# /!\ Only supported pipeline steps are available and modifications are needed if new technical steps are added
# Name of fully enabled pipeline would be:
# Data split DS, Subgroups SG, Remove Engineered Input REI, Remove Highly Correlated Features RHCF
# Random Under Sampler RUS/Synthetic Minority Over-sampling Technique SMOTE -> 1 step
# Standard Scaler ST/Robust Scaler RO/Min-max Scaler MI, PCA/Kernel PCA kPCA, Feature Transformation FT -> 1 step
# Feature Importance FI, Box Bar Plotting (BBP), Support Vector Machines SVM, High Performance Computing HPC
intermediate_dict = {'SG': enable_subgroups, 'DS': enable_data_split, 'REI': enable_engineered_input_removal,
'RHCF': enable_rhcf, 'RUS_SMOTE': enable_resampling, 'PCA-FT_kPCA-FT_LDA-FT': enable_ft,
'FI': enable_feature_importance, 'BBP': enable_box_bar_plots}
# Generating the folder intermediate name depending on enabled pipeline steps
folder_intermediate, tmp, tmp1 = '', '', ''
for key, items in intermediate_dict.items():
if items:
if key.__contains__('_') and not key.endswith('FT'): # Resampling first
tmp = (key.split('_')[0] if resampling_tech == 'rus' else key.split('_')[1]) # Get first or second tech
folder_intermediate += ('-' + tmp if pipeline_order == 'samples->features' else '') # Add if order allow it
elif key.__contains__('_') and key.endswith('FT'): # Next underscore bearing is FT
tmp1 = f'{kbest_tech}KBEST-FT' if (kbest_tech in ('chi2',
'cramer') or hasattr(kbest_tech, '__call__')) else ''
# Get first, second, or third tech (lda in case of third tech)
tmp1 += ('-' + key.split('_')[0] if pca_tech == 'normal_pca' else
'-' + key.split('_')[1] if pca_tech == 'kernel_pca' else
'-' + key.split('_')[2] if da_tech == 'lda' else '')
if pca_tech == 'kernel_pca' and len(kernel_pca_kernel_lpsr) < 3:
for pca_kern in kernel_pca_kernel_lpsr:
                    tmp1 = pca_kern + tmp1
folder_intermediate += '-' + scaler_tech[0:2].upper() # Before adding the FT tech, define & add scaler tech
folder_intermediate += ('-' + tmp1 if tmp1 != '' and not tmp1.startswith('-') else tmp1) # FT after scaler
# Delay resampling tech insertion after FT if pipeline order allows it
elif folder_intermediate.endswith('FT') and pipeline_order == 'features->samples':
# As this happens in the following step 'FI' after FT, tmp must be placed in between
folder_intermediate += (tmp + '-' + key if folder_intermediate.endswith('-') else '-' + tmp + '-' + key)
else: # For each true item if not yet called above
folder_intermediate += '-' + key
if key == 'PCA-FT_kPCA-FT_LDA-FT' and not enable_ft and pipeline_order == 'samples->features':
# In case of disabled FT, standard scaler is applied and should also appear in the folder name relative to order
folder_intermediate += '-' + scaler_tech[0:2].upper()
elif key == 'PCA-FT_kPCA-FT_LDA-FT' and not enable_ft and pipeline_order == 'features->samples':
folder_intermediate += '-' + scaler_tech[0:2].upper() + '-' + tmp + ('-' + tmp1 if tmp1 != '' else tmp1)
# Swap Kbest with PCA/LDA if both are used (no need to consider the kPCA case, it will be detected with 'PCA' anyhow)
if sum([True for match in ['KBEST', 'PCA', 'LDA'] if match in folder_intermediate]) == 2:
folder_intermediate = swap_words(folder_intermediate, f'{kbest_tech}KBEST',
'kPCA' if pca_tech == 'kernel_pca' else 'PCA' if pca_tech == 'normal_pca' else
'LDA' if da_tech == 'lda' else '')
# -FT- might occur twice in the name if double transformation is selected, in that case remove the first -FT-
if folder_intermediate.count('-FT-') > 1:
folder_intermediate = folder_intermediate.replace('-FT', '', 1)
# Define the results folder name suffix based on if ipyparallel is activated (HPC-based)
folder_suffix = '-SVM' + ('-lin' if 'linear' in kernels and len(kernels) == 1
else '-non-lin' if 'linear' not in kernels else '-both-lin-and-non-lin')
if parallel_method == 'ipyparallel':
folder_suffix += '-HPC'
# Final results folder will be a combination of given prefix, intermediate name, and HPC and classifier dependent suffix
folder_name = folder_prefix\
+ folder_intermediate\
+ folder_suffix\
if not folder_prefix.endswith(('/', '\\')) else folder_prefix + folder_intermediate[1:] + folder_suffix
# Create the folder or clear the content of the final folder (if multiple) if already existing
if folder_prefix.__contains__('/'):
# If a folder in folder is given, first check if the first folder exists, if not create it (do not clear if exists)
tmp_dir = curr_dir
for slash in range(folder_prefix.count('/')):
if os.path.isdir(tmp_dir + '/' + folder_prefix.split('/')[slash]) is False:
os.mkdir(tmp_dir + '/' + folder_prefix.split('/')[slash])
tmp_dir += '/' + folder_prefix.split('/')[slash]
else:
tmp_dir += '/' + folder_prefix.split('/')[slash]
# Now as all pre folders are created, create the final results folder, if it already exists, append a 2-digit value
folder_name = folder_name + '_00' # folder _00 until _99 equal 100 folders
if os.path.isdir(curr_dir + '/' + folder_name) is False:
os.mkdir(curr_dir + '/' + folder_name)
else:
files = os.listdir(curr_dir + '/' + os.path.split(folder_name)[0])
size = len(folder_name)
count = sum([folder_name.split('/')[-1][:-3] in f for f in files])
if count >= 100: # if there are 100 folders or more (00-99), add another digit to display _100 in worst cases
folder_name = folder_name.replace(folder_name[size - 3:], f"{'%03d' % count}", 1)
else:
folder_name = folder_name.replace(folder_name[size - 3:], f"_{'%02d' % count}", 1)
os.mkdir(curr_dir + '/' + folder_name)
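# Illustrative example of the naming scheme above (hypothetical configuration): with folder_prefix 'results/run',
# only data split and RHCF enabled, standard scaling, a linear SVM kernel, ipyparallel, and the 'samples->features'
# order, the first run creates 'results/run-DS-RHCF-ST-SVM-lin-HPC_00', the next one ends in '_01', and so on.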
###############################################################
# ## HPC parallelization preparations and n_jobs configuration
###############################################################
# Source: https://ulhpc-tutorials.readthedocs.io/en/latest/python/advanced/scikit-learn/
# Enable only if running with ipyparallel on HPC
if parallel_method != 'ipyparallel':
client = None
# Open output file to redirect output prints if running on local machine
file_path = curr_dir + '/' + folder_name + '/' + 'CBD-P_output_raw.txt'
print(f"As parallel method is not set to ipyparallel, it is assumed that you are running the experiment on "
f"a local machine.\nTherefore, the raw output results of the pipeline will be flushed to the output file "
f"at the following location:\n** {file_path.replace(chr(92), '/')} **\n\nIf the process is exiting for any "
f"reason before completing the pipeline,\nplease use the following two lines to redirect the printouts to "
f"your local console.\nsys.stdout.close()\nsys.stdout = orig_stdout\n") # with chr(92) being a backslash
print("\n******************************************")
buff_size = 1
sys.stdout = open(file_path, "w", buffering=buff_size)
else:
# To know the location of the python script
FILE_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append(FILE_DIR)
# Prepare the logger
parser = argparse.ArgumentParser()
parser.add_argument("-p", "--profile", default="ipy_profile", help="Name of IPython profile to use")
args = parser.parse_args()
profile = args.profile
logging.basicConfig(filename=os.path.join(FILE_DIR, profile + '.log'), filemode='w', level=logging.DEBUG)
logging.info("number of CPUs found: {0}".format(cpu_count()))
logging.info("args.profile: {0}".format(profile))
# Prepare the engines
client = Client(profile=profile)
NB_WORKERS = int(os.environ.get("NB_WORKERS", 1))
# wait for the engines
client.wait_for_engines(NB_WORKERS)
# The following command will make sure that each engine is running in the right working directory
client[:].map(os.chdir, [FILE_DIR] * len(client))
logging.info("c.ids :{0}".format(str(client.ids)))
b_view = client.load_balanced_view()
register_parallel_backend(parallel_method, lambda: IPythonParallelBackend(view=b_view))
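# Hedged usage sketch (comments only): once the backend is registered under the name in parallel_method, any
# joblib-aware estimator in this script can dispatch its fits to the ip-engines, e.g. hypothetically:
#     from joblib import parallel_backend
#     with parallel_backend('ipyparallel', n_jobs=len(client)):
#         grid_search.fit(train_features, train_labels)  # hypothetical GridSearchCV instance and data names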
# Configure number of jobs
if parallel_method == 'ipyparallel':
n_jobs = len(client) # Either len(client) on HPC or -1 on local machine
else:
n_jobs = n_jobs # Assume running on local machine with different parallel backend (e.g. threading)
########################################################################################################################
# ## PART 3: MAIN PART OF THE SCRIPT WITH EXECUTABLE CODE ##############################################################
########################################################################################################################
##################################
# ## Script configuration summary
##################################
# Kernel based dictionary of enabled parameters length
kernel_param_lengths = {}
for kern in kernels:
tmp = []
for key, items in total_params_and_splits.items():
# all param lengths with underscore and containing the first letter of the corresponding kernel in the last part
tmp.append(len(items) if key.__contains__('_') and key.split('_')[-1].__contains__(kern[0])
else 1 if key.__contains__('_') or items == [None] else items)
if key.__contains__('_') and not key.split('_')[-1].__contains__(kern[0]):
total_params_and_splits.update({key: [None]})
kernel_param_lengths[kern] = tmp
# Number of total fits for each selected kernel
total_fits = {}
for kern in kernels:
total_fits[str(kern)] = math.prod(kernel_param_lengths[kern])
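# Worked example (hypothetical lengths): if the linear-kernel entries of total_params_and_splits contribute lengths
# 4 (regularization_lpsr) and 3 (tolerance_lpsr), the CV splits entry contributes its value 5, and every other entry
# is reduced to [None] (length 1), then math.prod gives 4 * 3 * 5 = 60 total fits for the linear kernel.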
# Print a summary of the selected script configurations
backslash = '\\'
newline = '\n'
print("******************************************\nSCRIPT CONFIGURATION SUMMARY OF VARIABLES:\n\n"
f"Debugging mode: {debug}\n"
f"Random number generator seed: {seed}\n"
f"Fixed general figure font: {fix_font}\n"
f"Font for feature importance figures: {imp_font}\n"
f"Figure max open warning set to: {fig_max_open_warning}\n"
f"Matplotlib figure style: {plot_style}\n"
f"Number of displayed pandas columns: {pandas_col_display_option}\n"
f"Selected figure format and dot-per-inches: {figure_format, figure_dpi}\n\n"
f"Current directory: {curr_dir.replace(backslash, '/')}\n"
f"Folder name prefix for this analysis: {folder_prefix}\n"
f"Training set absolute pathway: {train_path.replace(backslash, '/')}\n"
f"Test set absolute pathway: {test_path.replace(backslash, '/')}\n"
f"Target output feature: {output_feature}\n"
f"Names selected for the positive and negative classes respectively: {positive_class, negative_class}\n"
f"Features directly linked to the target: {output_related}\n\n"
f"Feature and threshold used to tag specific samples: {sample_tagging_feature, tag_threshold}\n"
f"Near-constant feature threshold: {thresh_near_constant}\n"
f"Data set splitting enabled based on splitting feature: {enable_data_split, split_feature}\n"
f"Prefix of engineered input features: {engineered_input_prefix}\n"
f"Enabled analysis of data subgroups: {enable_subgroups}\n"
f"The following subgroups will be analyzed: {subgroups_to_keep}\n"
f"Selected kernels for the SVM classifier: {kernels}\n"
f"With the following kernels being non linear: {non_linear_kernels}\n"
f"Cache size (mb) and decision function shape: {cache_size, decision_func_shape}\n"
f"Classifier and grid search verbosity: {clf_verbose, grid_verbose}\n"
f"Hard cap stopping criterion (max_iter): {hard_iter_cap}\n"
f"Number of stratified k fold CV split: {splits}\n"
f"Scorer selected for the analysis: {scorer}\n"
f"Feature importance shuffles for all, male and female data: {shuffle_all, shuffle_male, shuffle_female}\n"
f"Feature importance by permutation in case of linear classification: {linear_shuffle}\n"
f"Selected parallel backend method and number of jobs: {parallel_method, n_jobs}\n"
f"Removing features used for feature engineering enabled and selected feature prefixes: "
f"{enable_engineered_input_removal, engineered_input_prefix}\n"
f"Removing highly correlated features (RHCF) step enabled: {enable_rhcf}\n"
f"Correlation threshold for Cramer: {thresh_cramer}\n"
f"Correlation threshold for Point Bi-serial: {thresh_pbs}\n"
f"Correlation threshold for Spearman: {thresh_spearman}\n"
f"Resampling strategy and selected technique enabled: {enable_resampling, resampling_tech}\n"
f"Feature transformation (FT) step enabled: {enable_ft}\n"
f"Select k best technique for categorical features: {kbest_tech if kbest_tech != '' and enable_ft else None}\n"
f"Scaler technique for continuous variables selected: {scaler_tech + ' scaler'}\n"
f"PCA technique selected: {pca_tech.replace('_', ' ') if pca_tech != '' and enable_ft else None}\n"
f"If PCA is disabled, DA technique selected: {da_tech if da_tech != '' and enable_ft else None}\n"
f"Feature importance methods and visualizations enabled: {enable_feature_importance, feature_importance_method}\n"
f"Amount of most important features to show: {top_features}\n"
f"Box and bar plotting enabled and selected method: {enable_box_bar_plots, box_bar_figures}\n"
f"Order of steps in the pipeline if FT or resampling are enabled: {pipeline_order}\n"
f"Decision to drop or pass through features that are not transformed: {drop_or_pass_non_treated_features}\n"
f"Additional grid search parameters that are not directly supported: {additional_params}\n"
f"Additional technique parameters: {additional_technique_params}\n"
f"Additional kernel parameters: {additional_kernel_params}\n")
if enable_resampling and resampling_tech == 'rus':
with_or_without_sampling = 'with rus'
elif resampling_tech == 'smote':
with_or_without_sampling = 'with smote'
else:
with_or_without_sampling = 'without'
scale_only = 'including ' + scaler_tech + ' scaling, '
info_pca = \
(scale_only + (pca_tech.replace('_', ' ') + ', ' if pca_tech != '' else '') +
(str(kbest_tech) + ' select k best, ' if kbest_tech != '' else ''))
info_da = (scale_only + (da_tech + ', ' if da_tech != '' else '') +
(str(kbest_tech) + ' select k best, ' if kbest_tech != '' else ''))
for kern in kernels:
print(f"Total fitting for {kern} kernel, with {splits} fold cross-validation, {'with' if enable_ft else 'without'} "
f"feature transformation, "
f"{info_pca if pca_tech != '' else info_da if da_tech != '' else scale_only}\n"
f"{with_or_without_sampling} resampling, {'with' if enable_feature_importance else 'without'} feature "
f"importance, and {'with' if additional_params else 'without'} additional\nnon-supported grid search "
f"parameters: {total_fits[kern]}\n")
if pca_tech == 'kernel_pca' and len(kernel_pca_kernel_lpsr) > 1:
poly_fits = int(1/3 * total_fits[kern])
rbf_fits = int(1/3 * (int((1/len(kernel_pca_degree_lpsr) * 1/len(kernel_pca_coef0_lpsr))*total_fits[kern])))
sigmoid_fits = int(1/3 * (int((1/len(kernel_pca_degree_lpsr))*total_fits[kern])))
print(f"With kernelPCA enabled, the above calculated fits are masking the fact that each PCA kernel accepts "
f"different parameters.\nFor real, as 3 kernels are tested, only a third of above mentioned fits is "
f"tested for the poly kernelPCA.\nThe rbf and sigmoid kernels do accept less parameters, and therefore "
f"undergo less number of fits.\nThus, the total fits of this experiment are:\n"
f"{int(poly_fits + rbf_fits + sigmoid_fits)} total fits, with {int(poly_fits)} poly fits, {int(rbf_fits)}"
f" rbf fits, and {int(sigmoid_fits)} sigmoid fits.\n")
print(f"Overview of enabled grid search parameters:\n"
f"{newline.join(f'{key}: {value}' for key, value in total_params_and_splits.items())}\n\n"
f"Results folder pathway:\n{curr_dir.replace(backslash, '/') + '/' + folder_name}\n\n"
f"******************************************\n")
########################################################################
# ## Apply random seed, plot style, and pandas configuration parameters
########################################################################
# Random seed
random.seed(seed)
np.random.seed(seed)
# Plot styles
plt.rcParams['font.size'] = fix_font
plt.style.use(plot_style)
# Suppress figure max open warning from matplotlib
plt.rcParams.update({'figure.max_open_warning': fig_max_open_warning})
# Pandas option for number of displayed columns
pd.set_option('display.max_columns', pandas_col_display_option)
# Ensure threading on multiple devices
matplotlib.use('Agg' if (parallel_method == 'ipyparallel' or not debug) else 'TkAgg')
#####################################
# ## Loading the train and test data
#####################################
# Read in data as pandas dataframe
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
# Print the first rows
print('The first five rows of the original training set are:\n', train.head(5))
print('\nThe first five rows of the original test set are:\n', test.head(5))
# Print the shapes
print('\nThe shape of the original train set is:\n', train.shape)
print('The shape of the original test set is:\n', test.shape)
######################################
# ## Features and labels preparations
######################################
# collecting indices of samples to be tagged if given
print(f'\nTagging samples in the mixed train and test sets...')
sample_tag_idx_train, sample_tag_idx_test = \
tag_samples(train, test, sample_tagging_feature, tag_threshold)
# in sub samples if selected
if enable_data_split:
    print(f'\nTagging samples in the male train and test sets...')
sample_tag_idx_male_train, sample_tag_idx_male_test = \
tag_samples(train[train[split_feature] == 1],
test[test[split_feature] == 1],
sample_tagging_feature, tag_threshold)
print(f'\nTagging samples in the female train and test sets...')
sample_tag_idx_female_train, sample_tag_idx_female_test = \
tag_samples(train[train[split_feature] == 2],
test[test[split_feature] == 2],
sample_tagging_feature, tag_threshold)
else:
sample_tag_idx_male_train, sample_tag_idx_male_test, \
sample_tag_idx_female_train, sample_tag_idx_female_test = 4 * [None]
# Remove features that were used to calculate the selected output feature
print(f'\nRemoving the following {len(output_related)} output related features:\n{output_related}')
train = train.drop(columns=[out for out in output_related])
test = test.drop(columns=[out for out in output_related])
print(f'\nRetaining the following subgroups: {subgroups_to_keep}')
if enable_subgroups and subgroups_to_keep != 'all':
tmp_feats = list(train.columns)
feats_to_keep = \
[tmp_feats[col] for col in range(len(tmp_feats)) if tmp_feats[col].startswith(subgroups_to_keep)]
train = train.drop(columns=[group for group in tmp_feats if group not in feats_to_keep])
test = test.drop(columns=[group for group in tmp_feats if group not in feats_to_keep])
print('\nThe shape of the train set with selected subgroups including output feature is:\n', train.shape)
print('The shape of the test set with selected subgroups including output feature is:\n', test.shape)
# Split the data based on the given split feature
if enable_data_split and split_feature in train.columns:
print(f'\nSplitting the data based on {split_feature} ...')
train_features, test_features, train_labels, test_labels, feature_list, \
train_men_features, test_men_features, train_men_labels, test_men_labels, \
train_female_features, test_female_features, train_female_labels, test_female_labels, \
feature_list_wo_gender = separate_full_data(full_train=train, full_test=test,
target_feature=output_feature, splitting_feature=split_feature)
else: # Continue with full data only, split feature will be turned to None
    print(f'\nFull data analysis without data splitting, either because this step is disabled or\nbecause the '
          f'splitting feature is no longer among the subgroups to keep.\nSubgroup activation: {enable_subgroups}, '
          f'selected subgroups: {subgroups_to_keep}.')
train_features, test_features, train_labels, test_labels, feature_list = separate_full_data(
full_train=train, full_test=test, target_feature=output_feature, splitting_feature=split_feature)
# Pseudo call the below variables to be None to avoid IDE warnings
train_men_features, test_men_features, train_men_labels, test_men_labels, \
train_female_features, test_female_features, train_female_labels, test_female_labels, \
feature_list_wo_gender = [None] * 9
# Print the binary counts and ratio of negative and positive classes in the train and test sets
print(f'\n{negative_class.capitalize()}/{positive_class.capitalize()} counts in the full train set:',
np.bincount(train_labels), '\nratio:', round(np.bincount(train_labels)[0] / np.bincount(train_labels)[1], 3))
print(f'{negative_class.capitalize()}/{positive_class.capitalize()} counts in the full test set:',
np.bincount(test_labels), '\nratio:', round(np.bincount(test_labels)[0] / np.bincount(test_labels)[1], 3))
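# Worked example of the counts above: np.bincount on labels [0, 1, 1, 0, 0] returns array([3, 2]), i.e. a
# negative/positive ratio of 3 / 2 = 1.5.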
if enable_data_split:
print(f'\n{negative_class.capitalize()}/{positive_class.capitalize()} counts in the male train set:',
np.bincount(train_men_labels),
'\nratio:', round(np.bincount(train_men_labels)[0] / np.bincount(train_men_labels)[1], 3))
print(f'{negative_class.capitalize()}/{positive_class.capitalize()} counts in the male test set:',
np.bincount(test_men_labels),
'\nratio:', round(np.bincount(test_men_labels)[0] / np.bincount(test_men_labels)[1], 3))
print(f'\n{negative_class.capitalize()}/{positive_class.capitalize()} counts in the female train set:',
np.bincount(train_female_labels),
'\nratio:', round(np.bincount(train_female_labels)[0] / np.bincount(train_female_labels)[1], 3))
print(f'{negative_class.capitalize()}/{positive_class.capitalize()} counts in the female test set:',
np.bincount(test_female_labels),
'\nratio:', round(np.bincount(test_female_labels)[0] / np.bincount(test_female_labels)[1], 3))
#####################################
# ## Removing engineered input (REI)
#####################################
# Print the starting shapes
print('\nThe shape of the full train set before pre-processing and without output feature is:\n', train_features.shape)
print('The shape of the full test set before pre-processing and without output feature is:\n', test_features.shape)
if enable_data_split:
print('\nThe shape of the male train set before pre-processing and without output feature is:\n',
train_men_features.shape)
print('The shape of the male test set before pre-processing and without output feature is:\n',
test_men_features.shape)
print('\nThe shape of the female train set before pre-processing and without output feature is:\n',
train_female_features.shape)
print('The shape of the female test set before pre-processing and without output feature is:\n',
test_female_features.shape)
if enable_engineered_input_removal:
print("\nRemoving features that were used for feature engineering ...")
# Grouped medications GM and grouped devices GD (VitDDef BF or VitD_zscore_merged BF will be removed with RHCF)
engineered_input_feat = \
[feature_list[col] for col in range(len(feature_list)) if feature_list[col].startswith(engineered_input_prefix)]
# Remove those features from full data
engineered_input_idx_all = [col for col in range(len(feature_list)) if feature_list[col] in engineered_input_feat]
train_features = np.delete(train_features, engineered_input_idx_all, axis=1)
test_features = np.delete(test_features, engineered_input_idx_all, axis=1)
# Update feature list
feature_list = [x for x in feature_list if x not in engineered_input_feat]
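    # Worked example of the column removal above: np.delete(X, [0, 2], axis=1) on an (n, 5) array drops columns
    # 0 and 2, leaving shape (n, 3); the feature list is filtered in parallel to stay aligned with the columns.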
# Remove those features from gender data
if enable_data_split:
engineered_input_sex = \
[col for col in range(len(feature_list_wo_gender)) if feature_list_wo_gender[col] in engineered_input_feat]
train_men_features = np.delete(train_men_features, engineered_input_sex, axis=1)
train_female_features = np.delete(train_female_features, engineered_input_sex, axis=1)
test_men_features = np.delete(test_men_features, engineered_input_sex, axis=1)
test_female_features = np.delete(test_female_features, engineered_input_sex, axis=1)
# Update feature list (at this point both genders still share feature_list_wo_gender)
feature_list_male = [x for x in feature_list_wo_gender if x not in engineered_input_feat]
feature_list_female = [x for x in feature_list_wo_gender if x not in engineered_input_feat]
else:
feature_list_male, feature_list_female = [None] * 2
# Print summary
    print(f'A total of {len(engineered_input_feat)} sources of engineered features were removed to avoid high '
          f'correlation.\n')
if len(engineered_input_feat) == 0:
print(f'If this step is enabled ({enable_engineered_input_removal}) and 0 sources were removed, '
f'this means that either no engineered prefix\nwas defined in the config script, '
f'or the considered features were not among the subgroups defined to keep.\n')
else: # If engineered features removal is disabled, male and female feature lists are still identical
feature_list = feature_list
if enable_data_split:
feature_list_male = feature_list_wo_gender # None if data splitting is removed
feature_list_female = feature_list_wo_gender # None if data splitting is removed
else:
feature_list_male, feature_list_female = [None] * 2
##############################################
# ## Checking for remaining constant features
##############################################
print("\nChecking and removing constant features in the full training set ...\n")
# Full data
constant_all = check_constant_features(feature_list, train_features, 'full', nbr_splits=splits,
near_constant_thresh=thresh_near_constant)
# Update the feature lists
feature_list = [feature_list[x] for x in range(len(feature_list)) if x not in constant_all]
# Remove those features
train_features = np.delete(train_features, constant_all, axis=1)
test_features = np.delete(test_features, constant_all, axis=1)
# Male and female data
if enable_data_split:
print("\nChecking and removing constant features in the male training set ...\n")
constant_male = check_constant_features(feature_list_male, train_men_features, 'male', nbr_splits=splits,
near_constant_thresh=thresh_near_constant)
print("\nChecking and removing constant features in the female training set ...\n")
constant_female = check_constant_features(feature_list_female, train_female_features, 'female', nbr_splits=splits,
near_constant_thresh=thresh_near_constant)
# Update gender feature list
feature_list_male = [feature_list_male[x] for x in range(len(feature_list_male)) if x not in constant_male]
feature_list_female = [feature_list_female[x] for x in range(len(feature_list_female)) if x not in constant_female]
# Remove those features
train_men_features = np.delete(train_men_features, constant_male, axis=1)
test_men_features = np.delete(test_men_features, constant_male, axis=1)
train_female_features = np.delete(train_female_features, constant_female, axis=1)
test_female_features = np.delete(test_female_features, constant_female, axis=1)
# Print the new shapes
print('\nThe shape of the full train set after pre-processing is:\n', train_features.shape)
print('The shape of the full test set after pre-processing is:\n', test_features.shape)
if enable_data_split:
print('\nThe shape of the male train set after pre-processing is:\n', train_men_features.shape)
print('The shape of the male test set after pre-processing is:\n', test_men_features.shape)
print('\nThe shape of the female train set after pre-processing is:\n', train_female_features.shape)
print('The shape of the female test set after pre-processing is:\n', test_female_features.shape)
###########################################################################
# ## Applying computation and removal of highly correlated features (RHCF)
###########################################################################
# If rhcf step is enabled, the different data type associations will be analyzed for their correlations
if enable_rhcf:
# Begin of the RHCF application
print(f"\nBeginning to remove highly correlated features (RHCF) with the following associations and thresholds:")
print(f"Categorical-categorical association with corrected Cramer's V {thresh_cramer} threshold.")
print(f"Continuous-continuous association with Spearman's Rank Order Correlation {thresh_spearman} threshold.")
print(f"Categorical-continuous association with Point Bi-Serial Correlation {thresh_pbs} threshold.\n")
# Full data
# Get the initial categorical and continuous indices
continuous_idx, categorical_idx = get_cat_and_cont(train_features, test_features)
if len(continuous_idx) < 1:
print('The correlation step for continuous-related associations must be skipped because there are no '
'continuous features left in the data sets after potentially removing subgroups and engineered features.')
elif len(categorical_idx) < 1:
print('The correlation step for categorical-related associations must be skipped because there are no '
'categorical features left in the data sets after potentially removing subgroups '
'and engineered features.')
print('Starting to analyze and remove highly correlated features in the full data set ...\n')
# Corrected Cramer's V correlation
if len(categorical_idx) > 1:
cramer_res, cat_to_drop, cramer_set = applied_cat_rhcf(
parallel_meth=parallel_method, training_features=train_features, features_list=feature_list,
categorical=categorical_idx, n_job=n_jobs, cramer_threshold=thresh_cramer, directory=curr_dir,
folder=folder_name, datatype='full')
# Heatmap of the cramer matrix (saving process inside function)
cramer_heatmap(cramer_res, thresh_cramer, 'full', categorical_idx, folder_name, figure_dpi, figure_format)
else:
cat_to_drop, cramer_set = [], set()
# Spearman correlation
if len(continuous_idx) > 1:
spearman_res, cont_to_drop, spearman_set = applied_cont_rhcf(training_features=train_features,
features_list=feature_list,
continuous=continuous_idx,
spearman_threshold=thresh_spearman,
directory=curr_dir, folder=folder_name,
datatype='full')
# Heatmap of the spearman matrix (saving process inside function)
spearman_heatmap(spearman_res, thresh_spearman, 'full', continuous_idx, folder_name, figure_dpi, figure_format)
else:
cont_to_drop, spearman_set = [], set()
# General data update after continuous and categorical correlation features were identified
rem_train, rem_test, rem_feat, rem_idx = drop_and_update_correlated_data(
continuous_to_drop=cont_to_drop, categorical_to_drop=cat_to_drop, training_set=train_features,
test_set=test_features, features_list=feature_list)
# Point bi-serial correlation
if len(categorical_idx) >= 1 and len(continuous_idx) >= 1:
longest, res_pb_r, res_pb_pv, pbs_to_drop, pbs_set, rem_cont, rem_cat = applied_cat_cont_rhcf(
parallel_meth=parallel_method, training_features=train_features, cont_before_rhcf=continuous_idx,
cat_before_rhcf=categorical_idx, features_list=feature_list, feat_after_rhcf=rem_feat,
feat_idx_after_rhcf=rem_idx, n_job=n_jobs, pbs_threshold=thresh_pbs, directory=curr_dir, folder=folder_name,
datatype='full')
# Heatmap of the point bi-serial matrix (saving process inside function)
pbs_heatmap(res_pb_r, thresh_pbs, 'full', rem_cat, rem_cont, longest, folder_name, figure_dpi, figure_format)
else:
pbs_to_drop, rem_cat, rem_cont, pbs_set = [], [], [], set()
# Generate the final remaining training set, test set, and feature list
final_train_features, final_test_features, final_feature_list = final_drop_and_update(
point_bs_to_drop=pbs_to_drop, training_set=rem_train, test_set=rem_test, features_list=rem_feat)
# RHCF summary prints and general data update
original_train, original_test, original_feat, train_features, test_features, feature_list = rhcf_update_summary(
training_features=train_features, testing_features=test_features, features_list=feature_list,
fin_train=final_train_features, fin_test=final_test_features, fin_feat=final_feature_list, datatype='full',
cat_drop=cat_to_drop, cont_drop=cont_to_drop, pbs_drop=pbs_to_drop, cont_idx=continuous_idx,
cat_idx=categorical_idx, remain_cont=rem_cont, remain_cat=rem_cat, cramer_threshold=thresh_cramer,
spearman_threshold=thresh_spearman, pbs_threshold=thresh_pbs, remaining_train=rem_train,
remaining_test=rem_test)
# Male data
if enable_data_split:
# Get the initial categorical and continuous indices
continuous_idx_male, categorical_idx_male = get_cat_and_cont(train_men_features, test_men_features)
if len(continuous_idx_male) < 1:
print('The correlation step for continuous-related associations must be skipped because there are no '
'continuous features left in the data sets after potentially removing subgroups and engineered '
'features.')
elif len(categorical_idx_male) < 1:
print('The correlation step for categorical-related associations must be skipped because there are no '
'categorical features left in the data sets after potentially removing subgroups '
'and engineered features.')
print('Starting to analyze and remove highly correlated features in the male data set ...\n')
# Corrected Cramer's V correlation
if len(categorical_idx_male) > 1:
cramer_res_male, cat_to_drop_male, cramer_set_male = applied_cat_rhcf(
parallel_meth=parallel_method, training_features=train_men_features, features_list=feature_list_male,
categorical=categorical_idx_male, n_job=n_jobs, cramer_threshold=thresh_cramer, directory=curr_dir,
folder=folder_name, datatype='male')
# Heatmap of the male cramer matrix (saving process inside function)
cramer_heatmap(cramer_res_male, thresh_cramer, 'male', categorical_idx_male, folder_name, figure_dpi,
figure_format)
else:
cat_to_drop_male, cramer_set_male = [], set()
# Spearman correlation
if len(continuous_idx_male) > 1:
spearman_res_male, cont_to_drop_male, spearman_set_male = applied_cont_rhcf(
training_features=train_men_features, features_list=feature_list_male, continuous=continuous_idx_male,
spearman_threshold=thresh_spearman, directory=curr_dir, folder=folder_name, datatype='male')
# Heatmap of the male spearman matrix (saving process inside function)
spearman_heatmap(spearman_res_male, thresh_spearman, 'male', continuous_idx_male, folder_name,
figure_dpi, figure_format)
else:
cont_to_drop_male, spearman_set_male = [], set()
# General data update after continuous and categorical correlation features were identified
rem_train_male, rem_test_male, rem_feat_male, rem_idx_male = drop_and_update_correlated_data(
continuous_to_drop=cont_to_drop_male, categorical_to_drop=cat_to_drop_male, training_set=train_men_features,
test_set=test_men_features, features_list=feature_list_male)
# Point bi-serial correlation
if len(categorical_idx_male) >= 1 and len(continuous_idx_male) >= 1:
longest_male, res_pb_r_male, res_pb_pv_male, pbs_to_drop_male, pbs_set_male, \
rem_cont_male, rem_cat_male = applied_cat_cont_rhcf(
parallel_meth=parallel_method, training_features=train_men_features,
cont_before_rhcf=continuous_idx_male, cat_before_rhcf=categorical_idx_male,
features_list=feature_list_male, feat_after_rhcf=rem_feat_male, feat_idx_after_rhcf=rem_idx_male,
n_job=n_jobs, pbs_threshold=thresh_pbs, directory=curr_dir, folder=folder_name, datatype='male')
# Heatmap of the male point bi-serial matrix (saving process inside function)
pbs_heatmap(res_pb_r_male, thresh_pbs, 'male', rem_cat_male, rem_cont_male, longest_male, folder_name,
figure_dpi, figure_format)
else:
pbs_to_drop_male, rem_cont_male, rem_cat_male, pbs_set_male = [], [], [], set()
# Generate the final remaining training set, test set, and feature list
final_train_features_male, final_test_features_male, final_feature_list_male = final_drop_and_update(
point_bs_to_drop=pbs_to_drop_male, training_set=rem_train_male, test_set=rem_test_male,
features_list=rem_feat_male)
# RHCF summary prints and general data update
original_train_male, original_test_male, original_feat_male, train_men_features, test_men_features, \
feature_list_male = rhcf_update_summary(
training_features=train_men_features, testing_features=test_men_features,
features_list=feature_list_male, fin_train=final_train_features_male, fin_test=final_test_features_male,
fin_feat=final_feature_list_male, datatype='male', cat_drop=cat_to_drop_male,
cont_drop=cont_to_drop_male, pbs_drop=pbs_to_drop_male, cont_idx=continuous_idx_male,
cat_idx=categorical_idx_male, remain_cont=rem_cont_male, remain_cat=rem_cat_male,
cramer_threshold=thresh_cramer, spearman_threshold=thresh_spearman, pbs_threshold=thresh_pbs,
remaining_train=rem_train_male, remaining_test=rem_test_male)
# Female data
# Get the initial categorical and continuous indices
continuous_idx_female, categorical_idx_female = get_cat_and_cont(train_female_features, test_female_features)