-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsynthesis_wrapper.py
80 lines (60 loc) · 3.52 KB
/
synthesis_wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
# coding: utf-8
# Author: Mandis Beigi
# Copyright (c) 2022 Medidata Solutions, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import logging
import os

import k_anonymity
import synthesis_lib
import preprocessor_lib
import utilities_lib
def synthesize(df, config):
    """Anonymize ``df``, synthesize a new data set from it and save the result.

    The pipeline runs these steps in order:

    1. k-anonymity on ``df`` (date columns are passed as columns to ignore).
    2. Label-encode and impute the non-date columns, then derive groups of
       correlated columns and merge them with ``config.col_pairings``.
    3. One-hot encode the non-date columns and pass them to
       ``synthesis_lib.synthesize``.
    4. Decode the synthesized frame, keep only the columns it shares with the
       anonymized input, and write it to
       ``<config.output_dir>/<config.proj_name>_syn.csv``.

    Args:
        df: Input data. Used as a pandas ``DataFrame`` (``.shape``, ``.loc``,
            ``.columns``, ``.reset_index``) -- presumably always one; confirm
            against callers.
        config: Object exposing the attributes read here: ``anonymity_k``,
            ``corr_threshold``, ``col_pairings``, ``embedding_method``,
            ``embedding_metric``, ``min_cluster_size``, ``max_cluster_size``,
            ``batch_size``, ``include_outliers``, ``holdout_cols``,
            ``imputing_method``, ``add_noise``, ``output_dir`` and
            ``proj_name``.

    Returns:
        The decoded synthetic data frame, i.e. the same frame that is written
        to the CSV file.
    """
    logging.info('Performing k-anonymity to the data......................')
    logging.info('The data size before k-anonymity: %s', df.shape)
    date_columns = utilities_lib.get_date_columns(df)
    df = k_anonymity.perform_k_anonymity(df, config.anonymity_k, date_columns)
    logging.info('The data size after k-anonymity: %s', df.shape)

    # Re-derive the date columns from the frame returned by the k-anonymity
    # step (same call order as before), then leave them out of the encoding.
    date_columns = utilities_lib.get_date_columns(df)
    non_date_df = df.loc[:, ~df.columns.isin(date_columns)]

    # The label-encoded view is used only to find correlated column groups;
    # the encoding dictionary it returns is not needed here.
    label_encoded_df, _ = preprocessor_lib.label_encoding_encode(non_date_df)
    label_encoded_df = preprocessor_lib.impute_label_encoded_df(label_encoded_df)
    corr_cols_groups = synthesis_lib.generate_corr_cols_groups(
        label_encoded_df, config.corr_threshold)
    col_pairings = utilities_lib.merge_2d_lists(
        corr_cols_groups, config.col_pairings)

    # The synthesizer itself works on the one-hot encoded view.
    encoded_df = preprocessor_lib.one_hot_encoding_encode(non_date_df)
    logging.info('encoded_df: %s', encoded_df.shape)

    logging.info('Synthesizing the data data.............................')
    syn_encoded_df = synthesis_lib.synthesize(
        encoded_df,
        method=config.embedding_method,
        metric=config.embedding_metric,
        min_cluster_size=config.min_cluster_size,
        max_cluster_size=config.max_cluster_size,
        batch_size=config.batch_size,
        corr_thresh=config.corr_threshold,
        include_outliers=config.include_outliers,
        holdout_cols=config.holdout_cols,
        derived_cols_dict={},
        col_pairings=col_pairings,
        imputing_method=config.imputing_method,
        add_noise=config.add_noise,
    )
    logging.info('syn_encoded_df: %s', syn_encoded_df.shape)

    logging.info('Decoding the synthesized data...............................')
    # drop=False turns the index into a regular column before decoding.
    syn_encoded_df_no_index = syn_encoded_df.reset_index(drop=False)
    syn_df = preprocessor_lib.one_hot_encoding_decode(syn_encoded_df_no_index)

    logging.info('Saving the synthesized data.....................................')
    logging.info('syn_df: %s', syn_df.shape)

    # Restrict the output to the columns shared with the anonymized input
    # (with its index also moved into a column).
    df = df.reset_index(drop=False)
    shared_columns = utilities_lib.intersection(df.columns, syn_df.columns)
    syn_df = syn_df.reindex(columns=shared_columns)

    # os.path.join inserts a separator only when output_dir lacks one, so an
    # output_dir without a trailing slash no longer yields a mangled file name.
    output_path = os.path.join(config.output_dir, config.proj_name + '_syn.csv')
    syn_df.to_csv(output_path, index=False)

    return syn_df