From 755aee3142bae6c830debb4736e0b95a35946578 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Mon, 9 Jan 2023 11:06:01 +0100 Subject: [PATCH 01/30] . --- .../memilio/epidata/assessNPIEffects.py | 481 ++++++++++++++++++ 1 file changed, 481 insertions(+) create mode 100644 pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py new file mode 100644 index 0000000000..bcd411494f --- /dev/null +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -0,0 +1,481 @@ +############################################################################# +# Copyright (C) 2020-2023 German Aerospace Center (DLR-SC) +# +# Authors: Martin J. Kuehn +# +# Contact: Martin J. Kuehn +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################# +import os +import sys +import time +from datetime import datetime, timedelta + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from memilio.epidata import customPlot +from memilio.epidata import defaultDict as dd +from memilio.epidata import geoModificationGermany as geoger +from memilio.epidata import getDataIntoPandasDataFrame as gd +from scipy.cluster import hierarchy +from scipy.spatial.distance import pdist + + +def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): + """! Computes a score for a particular clustering based on the + correlation matrix. The score is computed as the percentage of 'higher' + or 'high' values (e.g., between 0.5 and 0.75 or 0.75 and 1) of the + correlation matrix that are to be found in the diagonal blocks of the + clustered correlation matrix vs these values in the offdiagonal blocks. + + @param corr_mat correlation matrix between the features / data set items + that were clustered. + @param idx_to_cluster_idx Mapping of data item to cluster index. + @param indices_all List of indices of all data items. + + @return Scores for the provided clustering. + """ + + if idx_to_cluster_idx.min() == 1: + idx_to_cluster_idx -= 1 + + # store indices of clusters + clusters = [[] for i in range(idx_to_cluster_idx.max()+1)] + for ii in range(len(idx_to_cluster_idx)): + clusters[idx_to_cluster_idx[ii]].append(ii) + # store remaining/perpendicular indices for all clusters + clusters_perp = [[] for i in range(idx_to_cluster_idx.max()+1)] + for ii in range(len(clusters)): + clusters_perp[ii] = list(indices_all.difference(set(clusters[ii]))) + # extract correlation values of block diagonals and offdiagonals separ. 
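    # Illustrative note (toy values, not from the NPI data): for a 3x3
    # correlation matrix and clusters = [[0, 1], [2]], the np.ix_ selections
    # used below pick out the within-cluster and off-diagonal blocks, e.g.
    #   toy = np.array([[1.0, 0.9, 0.1],
    #                   [0.9, 1.0, 0.2],
    #                   [0.1, 0.2, 1.0]])
    #   toy[np.ix_([0, 1], [0, 1])]  # diagonal block    -> [[1.0, 0.9], [0.9, 1.0]]
    #   toy[np.ix_([0, 1], [2])]     # offdiagonal block -> [[0.1], [0.2]]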
+ corr_diag = [] + corr_offdiag = [] + for ii in range(len(clusters)): + corr_diag = np.append(corr_diag, abs( + corr_mat[np.ix_(clusters[ii], clusters[ii])].flatten())) + corr_offdiag = np.append(corr_offdiag, abs( + corr_mat[np.ix_(clusters[ii], clusters_perp[ii])].flatten())) + + corr_thresholds = [0.25, 0.5, 0.75] + cluster_quantification = np.zeros(6) + for ii in range(len(corr_thresholds)): + num_diag = len(np.where(corr_diag > corr_thresholds[ii])[0]) + num_offdiag = len(np.where(corr_offdiag > corr_thresholds[ii])[0]) + if ii < len(corr_thresholds)-1: + num_diag -= len(np.where(corr_diag > corr_thresholds[ii+1])[0]) + num_offdiag -= len(np.where(corr_offdiag > + corr_thresholds[ii+1])[0]) + cluster_quantification[2*ii] = num_diag / (num_diag+num_offdiag) + cluster_quantification[2*ii+1] = ( + num_diag+num_offdiag) / (len(indices_all)**2) + + # print scores on clustering + print("Number of clusters: " + str(len(clusters)) + + ", shares diag/all between [0.25, 0.5, 0.75]: %.4f" % + cluster_quantification[0] + " (%.4f" % cluster_quantification[1] + + "), " + " %.4f " % cluster_quantification[2] + " (%.4f" % + cluster_quantification[3] + "), " + " %.4f " % + cluster_quantification[4] + " (%.4f" % cluster_quantification[5] + ")") + + return cluster_quantification + + +def compute_hierarch_clustering(corr_mat, corr_pairwdist, + metrics=['single', 'complete', 'average', + 'weighted', 'centroid', 'median', + 'ward']): + """! Computes a hierarchical clustering for a (list of) metric(s) and + provides the maximum cophenetic distance(s) as well as a score for the + clustering (see @method evaluate_clustering(...)). + + @param corr_mat correlation matrix between the features / data set items + to be clustered hierarchically. + @param corr_pairwdist Computed pairwise distance between the features / data + set items. + @param metric Metric or list of metrics to compute the hierarchical + clustering. + + @return (List of) hierarchical clustering(s), maximum cophenetic distance(s) + and scores of the hierarchical clustering. + """ + # NOTE: if changing metric, pay attention to linkage methods; + # 'centroid', 'median', and 'ward' are correctly defined only if + # Euclidean pairwise metric is used. + # Based on the distances, we compute an hierarchical clustering for + # different metrics + max_coph_corr = 0 + scores = dict() + # allow single entry + if not isinstance(metrics, list): + metrics = [metrics] + # iterate over list + for metric in metrics: + cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=metric) + # compute cophentic correlation distance + coph_corr, coph_dists = hierarchy.cophenet( + cluster_hierarch, pdist(corr_mat)) + scores[metric] = coph_corr + if coph_corr > max_coph_corr: + max_coph_corr = coph_corr + max_metric = metric + max_coph_dist = coph_dists + + cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=max_metric) + + print( + "Cophentic correlation distance for metric " + max_metric + ": " + + str(max_coph_corr)) + + return cluster_hierarch, max_coph_dist, scores + + +def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): + """! Flattens a hierarchical clustering for a (list of) maximum cophenetic + distance(s) in the flat clusters and evaluates the resulting clustering with + respect to the corresponding correlation matrix. + + @param corr_mat correlation matrix between the features / data set items + clustered hierarchically. + @param cluster_hierarch hierarchical clustering of given features / data + set items. 
+ @param weigths Maximum cophenetic distance or list of maximum cophenetic + distances to compute the flat clustering(s). + + @return flat clustering(s) according to the (list of) maximum distance(s). + """ + + # all indices in npis_corr from 0 to n-1 + npi_indices_all = set(range(corr_mat.shape[0])) + npi_idx_to_cluster_idx_list = [] + # allow single entries + if not isinstance(weights, list): + weights = [weights] + # iterate over weights + for weight in weights: + # use the given weight to flatten the dendrogram + npi_idx_to_cluster_idx = hierarchy.fcluster( + cluster_hierarch, weight, criterion='distance') + + # evaluate clustering + evaluate_clustering(corr_mat, npi_idx_to_cluster_idx, npi_indices_all) + + # append new npi_idx to cluster_idx assignment to list of assignments + npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) + + return npi_idx_to_cluster_idx_list + + +def analyze_npi_data( + read_data, make_plot, fine_resolution, npis, directory, file_format, + npi_codes_considered): + + if not read_data: + x = 15 + # transform_npi_data(fine_resolution=2, + # file_format=dd.defaultDict['file_format'], + # out_folder=dd.defaultDict['out_folder'], + # start_date=dd.defaultDict['start_date'], + # end_date=dd.defaultDict['end_date'], + # make_plot=dd.defaultDict['make_plot'], + # ) + + else: # read formatted file + + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'germany_counties_npi_subcat_incgrouped' + else: + filename = 'germany_counties_npi_subcat' + else: + filename = 'germany_counties_npi_maincat' + df_npis = pd.read_json(directory + filename + ".json") + # get code levels (main/subcodes) and position of main codes + # code_level = [i.count('_') for i in npi_codes] + # main_code_pos = [i for i in range(len(code_level)) if code_level[i] == 1] + + # check if any other integer than 0: not implemented or 1: implemented is + # used (maybe to specify the kind of implementation) + if len(np.where(df_npis[npi_codes_considered] > 1)[0]) > 0: + + print("Info: Please ensure that NPI information is only boolean.") + + else: + # sum over different NPIs and plot share of countires implementing + # these NPIs versus counties without corresponding actions + df_npis_aggregated = df_npis.groupby( + dd.EngEng['date']).agg( + {i: sum for i in npi_codes_considered}).copy() + npis_total_sum = df_npis_aggregated.sum() + + npi_codes_empty = list(np.array(npi_codes_considered)[ + np.where(npis_total_sum == 0)[0]]) + + npi_unused_indices_all = [] + npi_used_indices_all = [] + npi_unused_indices = [] + npi_used_indices = [] + for i in range(len(npi_codes_considered)): + if npi_codes_considered[i] in npi_codes_empty: + npi_unused_indices.append(i) + npi_unused_indices_all.append( + npis[dd.EngEng['npiCode']].index(npi_codes_considered[i])) + else: + npi_used_indices.append(i) + npi_used_indices_all.append( + npis[dd.EngEng['npiCode']].index(npi_codes_considered[i])) + + npis_unused = np.array(npis[dd.EngEng['desc']])[npi_unused_indices_all] + npis_used = np.array(npis[dd.EngEng['desc']])[npi_used_indices_all] + npi_codes_used = list(np.array(npi_codes_considered)[npi_used_indices]) + npi_codes_unused = list( + np.array(npi_codes_considered)[npi_unused_indices]) + + # open file to write unused categories + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'unused_subcats_incgrouped.txt' + else: + filename = 'unused_subcats.txt' + else: + filename = 'unused_maincats.txt' + file_npi = open(directory + filename, 'w') + # Writing unused NPIs + for i in 
range(len(npis_unused)): + file_npi.write(npi_codes_unused[i] + ": " + npis_unused[i]) + file_npi.write("\n") + # Closing file + file_npi.close() + + # open file to write unused categories + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'used_subcats_incgrouped.txt' + else: + filename = 'used_subcats.txt' + else: + filename = 'used_maincats.txt' + file_npi = open(directory + filename, 'w') + # Writing unused NPIs + for i in range(len(npis_used)): + file_npi.write(npi_codes_used[i] + ": " + npis_used[i]) + file_npi.write("\n") + # Closing file + file_npi.close() + + df_npis_used = df_npis[[dd.EngEng['date'], + dd.EngEng['idCounty']] + npi_codes_used].copy() + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'germany_counties_npi_subcat_used_incgrouped' + else: + filename = 'germany_counties_npi_subcat_used' + else: + filename = 'germany_counties_npi_maincat_used' + gd.write_dataframe(df_npis_used, directory, filename, file_format) + + # compute correlations + npis_corr = df_npis_used.iloc[:, 2:].corr().values + # plot log-colored correlations + plt.imshow(abs(npis_corr), cmap='gray_r') + # plot histogram + plt.figure() + plt.hist(npis_corr.flatten(), bins=50) + plt.title("Correlation histogram", fontsize=18) + plt.xlabel("Correlation", fontsize=12) + plt.ylabel("Number of values", fontsize=12) + + # We understand the rows of npis_corr, the correlations of one NPI + # to the others as one node in the #NPIs-used-dimensional space. + # We compute the pairwise distances of these nodes. Then, nodes with + # similar correlations towards all other nodes exhibit small distances + corr_pairwdist = hierarchy.distance.pdist( + npis_corr, metric='euclidean') + + # compute hierarchical clustering (via best-suited metric) + compare_metrics = True + if compare_metrics: + # centroid + metric = 'centroid' + cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( + abs(npis_corr), + corr_pairwdist, + metric) + # # plot dendrogram + plt.figure() + plt.title(metric) + hierarchy.dendrogram(cluster_hierarch) + plt.show() + max_coph_dist = coph_dist.max() + flatten_hierarch_clustering( + abs(npis_corr), cluster_hierarch, + [wg * max_coph_dist + for wg in [0.6, 0.625, 0.65, 0.675, 0.7, 0.725, 0.75]]) + # ward + metric = 'ward' + cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( + npis_corr, + corr_pairwdist, + metric) + # # plot dendrogram + # plt.figure() + # plt.title(metric) + # hierarchy.dendrogram(cluster_hierarch) + # plt.show() + max_coph_dist = coph_dist.max() + flatten_hierarch_clustering( + abs(npis_corr), cluster_hierarch, + [wg * max_coph_dist for wg in [0.1, 0.125, 0.15, 0.175, 0.2]]) + # average + metric = 'average' + cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( + abs(npis_corr), + corr_pairwdist, + metric) + # # plot dendrogram + # plt.figure() + # plt.title(metric) + # hierarchy.dendrogram(cluster_hierarch) + # plt.show() + max_coph_dist = coph_dist.max() + flatten_hierarch_clustering( + npis_corr, cluster_hierarch, + [wg * max_coph_dist + for wg in [0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.625, 0.65]]) + + metric = 'centroid' + cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( + npis_corr, + corr_pairwdist, + metric) + # # plot dendrogram + # plt.figure() + # plt.title(metric) + # hierarchy.dendrogram(cluster_hierarch) + # plt.show() + max_coph_dist = coph_dist.max() + npi_idx_to_cluster_idx = flatten_hierarch_clustering( + npis_corr, cluster_hierarch, + [wg * max_coph_dist + for wg in [0.65]]) + + 
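            # Note: flatten_hierarch_clustering returns one assignment per given
            # weight; each entry is the hierarchy.fcluster output, i.e. an array
            # with one integer cluster label per used NPI column. With the single
            # weight above, npi_idx_to_cluster_idx[0] holds that mapping.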
cluster_dict = dict() + cluster_codes = [[] for i in range(npi_idx_to_cluster_idx[0].max()+1)] + cluster_desc = [[] for i in range(npi_idx_to_cluster_idx[0].max()+1)] + for i in range(len(npi_idx_to_cluster_idx[0])): + cluster_dict[npi_codes_used[i] + ] = "CM_" + str(npi_idx_to_cluster_idx[0][i]).zfill(3) + cluster_codes[npi_idx_to_cluster_idx[0] + [i]].append(npi_codes_used[i]) + cluster_desc[npi_idx_to_cluster_idx[0] + [i]].append(str(npis_used[i])) + + # create clustered dataframe + df_npis_clustered = df_npis[[ + dd.EngEng['date'], dd.EngEng['idCounty']]].copy() + + for i in range(len(cluster_codes)): + df_npis_clustered["CM_" + str(i).zfill(3) + ] = df_npis[cluster_codes[i]].max(axis=1).copy() + + npis_corr_cluster = df_npis_clustered.corr() + # npis_corr_cluster[abs(npis_corr_cluster)<0.25] = 0 + plt.imshow(abs(npis_corr_cluster), cmap='gray_r') + plt.title('Absolute correlation>0.25 of clustered NPIs') + plt.xlabel('NPI cluster') + plt.ylabel('NPI cluster') + plt.colorbar() + + # open file to write unused categories + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'clusters_subcats_incgrouped.txt' + else: + filename = 'clusters_subcats.txt' + else: + filename = 'clusters_maincats.txt' + file_npi = open(directory + filename, 'w') + # Writing unused NPIs + for i in range(len(cluster_codes)): + file_npi.write("Cluster " + str(i) + "\n") + for j in range(len(cluster_codes[i])): + file_npi.write(cluster_codes[i][j] + ": " + cluster_desc[i][j]) + file_npi.write("\n") + file_npi.write("\n") + # Closing file + file_npi.close() + + npi_idx_new = np.argsort(npi_idx_to_cluster_idx[0]) + npis_corr_reorder = npis_corr[npi_idx_new, :][:, npi_idx_new] + + plt.imshow(abs(npis_corr_reorder), cmap='gray_r') + plt.colorbar() + + # npi_indices_all = set(range(npis_corr.shape[0])) + # for i in [40]:#[10, 20, 40, 80, 160]: + # kmeans_npis = KMeans(n_clusters=i).fit(df_npis_used.iloc[:,2:].T) + # evaluate_clustering(npis_corr, kmeans_npis.labels_, npi_indices_all) + + # for i in [40]:#[10, 20, 40, 80, 160]: + # kmeans_corr = KMeans(n_clusters=i).fit(npis_corr) + # evaluate_clustering(npis_corr, kmeans_corr.labels_, npi_indices_all) + + # corr_threshold = 0.5 + # corr_indices_threshold = np.where(npis_corr > corr_threshold) + # npis_corr_threshold = np.zeros(npis_corr.shape) + # npis_corr_threshold[corr_indices_threshold] = npis_corr[corr_indices_threshold] + # plt.imshow(npis_corr_threshold, cmap='gray_r') + + # plot share of counties that implement the main categories + if make_plot: + # plot four different subsets of curves for better distinction + j = 0 + if fine_resolution > 0: + num_images = 15 + else: + num_images = 1 + for i in [ + slice( + int(len(npi_codes_used) / num_images) * i, + min( + int(len(npi_codes_used) / num_images) * + (i + 1), + len(npis_used))) for i in range( + num_images + 1)]: + customPlot.plotList(df_npis_aggregated.index, + [df_npis_aggregated[code] + for code in npi_codes_used[i]], + npis_used[i], + 'Counties implementing NPI main categories', + 'Date', 'Number', "Counties_NPI_main_" + + str(j) + "_of_"+str(num_images)) + j += 1 + + +def main(): + """! 
Main program entry.""" + + # arg_dict = gd.cli("testing") + fine_resolution = 2 + npis_final = [] + directory = [] + file_format = [] + npi_codes_considered = [] + analyze_npi_data(True, True, fine_resolution, npis_final, + directory, file_format, npi_codes_considered) + + +if __name__ == "__main__": + + main() From edfcdedf65170fd44d561ed0657c20ce29e90205 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 23 May 2023 13:08:48 +0200 Subject: [PATCH 02/30] fix some errors --- .../memilio/epidata/assessNPIEffects.py | 38 +++++++++++++------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index bcd411494f..50542e44a4 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -192,7 +192,10 @@ def analyze_npi_data( # ) else: # read formatted file - + npis = pd.read_excel( + os.path.join( + directory, 'datensatzbeschreibung_massnahmen.xlsx'), + sheet_name=3, engine='openpyxl') if fine_resolution > 0: if fine_resolution == 1: filename = 'germany_counties_npi_subcat_incgrouped' @@ -200,21 +203,32 @@ def analyze_npi_data( filename = 'germany_counties_npi_subcat' else: filename = 'germany_counties_npi_maincat' - df_npis = pd.read_json(directory + filename + ".json") + df_npis = pd.read_csv(directory + filename + ".csv") + try: + df_npis = df_npis.drop('Unnamed: 0', axis=1) + except KeyError: + pass + npis = pd.read_json(os.path.join(directory, 'npis.json')) + all_codes = [] + # merge subcodes of considered npis (Do we want this? To discuss...) + df_merged = df_npis.iloc[:, :2].copy() + for subcode in npi_codes_considered: + df_merged[subcode] = df_npis.filter(regex=subcode).sum(axis=1) + all_codes.append(df_npis.filter(regex=subcode).columns) # get code levels (main/subcodes) and position of main codes # code_level = [i.count('_') for i in npi_codes] # main_code_pos = [i for i in range(len(code_level)) if code_level[i] == 1] - # check if any other integer than 0: not implemented or 1: implemented is - # used (maybe to specify the kind of implementation) - if len(np.where(df_npis[npi_codes_considered] > 1)[0]) > 0: + # check if any other integer than 0: not implemented or 1: implemented is + # used (maybe to specify the kind of implementation) + if len(np.where(df_merged[npi_codes_considered] > 1)[0]) > 0: print("Info: Please ensure that NPI information is only boolean.") else: # sum over different NPIs and plot share of countires implementing # these NPIs versus counties without corresponding actions - df_npis_aggregated = df_npis.groupby( + df_npis_aggregated = df_merged.groupby( dd.EngEng['date']).agg( {i: sum for i in npi_codes_considered}).copy() npis_total_sum = df_npis_aggregated.sum() @@ -230,14 +244,14 @@ def analyze_npi_data( if npi_codes_considered[i] in npi_codes_empty: npi_unused_indices.append(i) npi_unused_indices_all.append( - npis[dd.EngEng['npiCode']].index(npi_codes_considered[i])) + np.where(npis[dd.EngEng['npiCode']] == 'M01a_010')[0][0]) else: npi_used_indices.append(i) npi_used_indices_all.append( - npis[dd.EngEng['npiCode']].index(npi_codes_considered[i])) + np.where(npis[dd.EngEng['npiCode']] == 'M01a_010')[0][0]) - npis_unused = np.array(npis[dd.EngEng['desc']])[npi_unused_indices_all] - npis_used = np.array(npis[dd.EngEng['desc']])[npi_used_indices_all] + npis_unused = np.array(npis['Description'])[npi_unused_indices_all] + npis_used = 
np.array(npis['Description'])[npi_used_indices_all] npi_codes_used = list(np.array(npi_codes_considered)[npi_used_indices]) npi_codes_unused = list( np.array(npi_codes_considered)[npi_unused_indices]) @@ -469,8 +483,8 @@ def main(): # arg_dict = gd.cli("testing") fine_resolution = 2 npis_final = [] - directory = [] - file_format = [] + directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') + file_format = 'json' npi_codes_considered = [] analyze_npi_data(True, True, fine_resolution, npis_final, directory, file_format, npi_codes_considered) From f6a9cc4121206db70c5883ddefe91359dc8b4cf7 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Fri, 30 Jun 2023 12:46:28 +0200 Subject: [PATCH 03/30] changes --- .../memilio/epidata/assessNPIEffects.py | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 50542e44a4..43c5f46042 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -18,16 +18,12 @@ # limitations under the License. ############################################################################# import os -import sys -import time -from datetime import datetime, timedelta import matplotlib.pyplot as plt import numpy as np import pandas as pd from memilio.epidata import customPlot from memilio.epidata import defaultDict as dd -from memilio.epidata import geoModificationGermany as geoger from memilio.epidata import getDataIntoPandasDataFrame as gd from scipy.cluster import hierarchy from scipy.spatial.distance import pdist @@ -123,7 +119,7 @@ def compute_hierarch_clustering(corr_mat, corr_pairwdist, # iterate over list for metric in metrics: cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=metric) - # compute cophentic correlation distance + # compute cophenetic correlation distance coph_corr, coph_dists = hierarchy.cophenet( cluster_hierarch, pdist(corr_mat)) scores[metric] = coph_corr @@ -182,7 +178,7 @@ def analyze_npi_data( npi_codes_considered): if not read_data: - x = 15 + raise FileNotFoundError('') # transform_npi_data(fine_resolution=2, # file_format=dd.defaultDict['file_format'], # out_folder=dd.defaultDict['out_folder'], @@ -209,26 +205,24 @@ def analyze_npi_data( except KeyError: pass npis = pd.read_json(os.path.join(directory, 'npis.json')) - all_codes = [] # merge subcodes of considered npis (Do we want this? To discuss...) 
- df_merged = df_npis.iloc[:, :2].copy() - for subcode in npi_codes_considered: - df_merged[subcode] = df_npis.filter(regex=subcode).sum(axis=1) - all_codes.append(df_npis.filter(regex=subcode).columns) # get code levels (main/subcodes) and position of main codes # code_level = [i.count('_') for i in npi_codes] # main_code_pos = [i for i in range(len(code_level)) if code_level[i] == 1] # check if any other integer than 0: not implemented or 1: implemented is # used (maybe to specify the kind of implementation) - if len(np.where(df_merged[npi_codes_considered] > 1)[0]) > 0: + + npi_codes_considered = npis.NPI_code.values.tolist()#[x for x in npis.NPI_code if len(x) <=8] + + if len(np.where(df_npis[npi_codes_considered] > 1)[0]) > 0: print("Info: Please ensure that NPI information is only boolean.") else: # sum over different NPIs and plot share of countires implementing # these NPIs versus counties without corresponding actions - df_npis_aggregated = df_merged.groupby( + df_npis_aggregated = df_npis.groupby( dd.EngEng['date']).agg( {i: sum for i in npi_codes_considered}).copy() npis_total_sum = df_npis_aggregated.sum() @@ -250,8 +244,8 @@ def analyze_npi_data( npi_used_indices_all.append( np.where(npis[dd.EngEng['npiCode']] == 'M01a_010')[0][0]) - npis_unused = np.array(npis['Description'])[npi_unused_indices_all] - npis_used = np.array(npis['Description'])[npi_used_indices_all] + npis_unused = np.array(npis['Description'])[npi_unused_indices_all] # for all fr1 codes: len=153 + npis_used = np.array(npis['Description'])[npi_used_indices_all] # for all fr1 codes: len=14 npi_codes_used = list(np.array(npi_codes_considered)[npi_used_indices]) npi_codes_unused = list( np.array(npi_codes_considered)[npi_unused_indices]) @@ -302,7 +296,7 @@ def analyze_npi_data( # compute correlations npis_corr = df_npis_used.iloc[:, 2:].corr().values # plot log-colored correlations - plt.imshow(abs(npis_corr), cmap='gray_r') + plt.imshow(abs(npis_corr), cmap='flag') # plot histogram plt.figure() plt.hist(npis_corr.flatten(), bins=50) @@ -370,7 +364,7 @@ def analyze_npi_data( metric = 'centroid' cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( - npis_corr, + npis_corr, # same result as abs(npis_corr) corr_pairwdist, metric) # # plot dendrogram @@ -382,7 +376,7 @@ def analyze_npi_data( npi_idx_to_cluster_idx = flatten_hierarch_clustering( npis_corr, cluster_hierarch, [wg * max_coph_dist - for wg in [0.65]]) + for wg in [0.50]]) # 10 cluster cluster_dict = dict() cluster_codes = [[] for i in range(npi_idx_to_cluster_idx[0].max()+1)] @@ -485,7 +479,7 @@ def main(): npis_final = [] directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') file_format = 'json' - npi_codes_considered = [] + npi_codes_considered = ['M01a_010', 'M01a_020'] analyze_npi_data(True, True, fine_resolution, npis_final, directory, file_format, npi_codes_considered) From d3b4b064a91e5b90496194c5a10bbbf45326ec54 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Fri, 30 Jun 2023 12:57:59 +0200 Subject: [PATCH 04/30] change heatmap --- pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 43c5f46042..e9a51c8790 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -296,7 +296,7 @@ def analyze_npi_data( # compute correlations npis_corr = 
df_npis_used.iloc[:, 2:].corr().values # plot log-colored correlations - plt.imshow(abs(npis_corr), cmap='flag') + plt.imshow(abs(npis_corr), cmap='gray_r') # plot histogram plt.figure() plt.hist(npis_corr.flatten(), bins=50) From 09e9279be30917ab904ba887cde0624852e1b868 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 3 Jul 2023 09:54:24 +0200 Subject: [PATCH 05/30] cluster quantification --- .../memilio/epidata/assessNPIEffects.py | 61 +++++++++---------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index e9a51c8790..d3fb86d2cb 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -64,28 +64,16 @@ def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): corr_offdiag = np.append(corr_offdiag, abs( corr_mat[np.ix_(clusters[ii], clusters_perp[ii])].flatten())) - corr_thresholds = [0.25, 0.5, 0.75] - cluster_quantification = np.zeros(6) - for ii in range(len(corr_thresholds)): - num_diag = len(np.where(corr_diag > corr_thresholds[ii])[0]) - num_offdiag = len(np.where(corr_offdiag > corr_thresholds[ii])[0]) - if ii < len(corr_thresholds)-1: - num_diag -= len(np.where(corr_diag > corr_thresholds[ii+1])[0]) - num_offdiag -= len(np.where(corr_offdiag > - corr_thresholds[ii+1])[0]) - cluster_quantification[2*ii] = num_diag / (num_diag+num_offdiag) - cluster_quantification[2*ii+1] = ( - num_diag+num_offdiag) / (len(indices_all)**2) + step_width=0.02 + corr_thresholds = np.linspace(0+step_width, 1, int(1/step_width)) + p_diag = np.array([len(np.where(corr_offdiag <= corr_thresholds[i])[0]) + for i in range(len(corr_thresholds))])/len(corr_offdiag)*step_width + p_offdiag = np.array([len(np.where(corr_diag < corr_thresholds[i])[0]) + for i in range(len(corr_thresholds))])/len(corr_diag)*step_width + + print(p_diag.sum(), p_offdiag.sum()) - # print scores on clustering - print("Number of clusters: " + str(len(clusters)) + - ", shares diag/all between [0.25, 0.5, 0.75]: %.4f" % - cluster_quantification[0] + " (%.4f" % cluster_quantification[1] + - "), " + " %.4f " % cluster_quantification[2] + " (%.4f" % - cluster_quantification[3] + "), " + " %.4f " % - cluster_quantification[4] + " (%.4f" % cluster_quantification[5] + ")") - - return cluster_quantification + return clusters, p_diag.sum()+p_offdiag.sum() def compute_hierarch_clustering(corr_mat, corr_pairwdist, @@ -158,6 +146,9 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): # allow single entries if not isinstance(weights, list): weights = [weights] + + total_eval_number = [[] for weight in weights] + n=0 # iterate over weights for weight in weights: # use the given weight to flatten the dendrogram @@ -165,12 +156,16 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): cluster_hierarch, weight, criterion='distance') # evaluate clustering - evaluate_clustering(corr_mat, npi_idx_to_cluster_idx, npi_indices_all) + clusters, total_eval_number[n]=evaluate_clustering(corr_mat, npi_idx_to_cluster_idx, npi_indices_all) # append new npi_idx to cluster_idx assignment to list of assignments npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) + n+=1 + + # print scores on clustering + print("Number of clusters: " + str(len(clusters)) + "; evaluation number: " + str(round(np.nanmax(abs(np.array(total_eval_number))), 4))) - return npi_idx_to_cluster_idx_list + return 
npi_idx_to_cluster_idx_list[total_eval_number.index(np.nanmax(abs(np.array(total_eval_number))))] def analyze_npi_data( @@ -324,12 +319,12 @@ def analyze_npi_data( plt.figure() plt.title(metric) hierarchy.dendrogram(cluster_hierarch) - plt.show() + # plt.show() max_coph_dist = coph_dist.max() flatten_hierarch_clustering( abs(npis_corr), cluster_hierarch, [wg * max_coph_dist - for wg in [0.6, 0.625, 0.65, 0.675, 0.7, 0.725, 0.75]]) + for wg in np.linspace(0.01,1,100)]) # ward metric = 'ward' cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( @@ -344,7 +339,7 @@ def analyze_npi_data( max_coph_dist = coph_dist.max() flatten_hierarch_clustering( abs(npis_corr), cluster_hierarch, - [wg * max_coph_dist for wg in [0.1, 0.125, 0.15, 0.175, 0.2]]) + [wg * max_coph_dist for wg in np.linspace(0.01,1,100)]) # average metric = 'average' cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( @@ -360,7 +355,7 @@ def analyze_npi_data( flatten_hierarch_clustering( npis_corr, cluster_hierarch, [wg * max_coph_dist - for wg in [0.475, 0.5, 0.525, 0.55, 0.575, 0.6, 0.625, 0.65]]) + for wg in np.linspace(0.01,1,100)]) metric = 'centroid' cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( @@ -379,14 +374,14 @@ def analyze_npi_data( for wg in [0.50]]) # 10 cluster cluster_dict = dict() - cluster_codes = [[] for i in range(npi_idx_to_cluster_idx[0].max()+1)] - cluster_desc = [[] for i in range(npi_idx_to_cluster_idx[0].max()+1)] - for i in range(len(npi_idx_to_cluster_idx[0])): + cluster_codes = [[] for i in range(npi_idx_to_cluster_idx.max()+1)] + cluster_desc = [[] for i in range(npi_idx_to_cluster_idx.max()+1)] + for i in range(len(npi_idx_to_cluster_idx)): cluster_dict[npi_codes_used[i] - ] = "CM_" + str(npi_idx_to_cluster_idx[0][i]).zfill(3) - cluster_codes[npi_idx_to_cluster_idx[0] + ] = "CM_" + str(npi_idx_to_cluster_idx[i]).zfill(3) + cluster_codes[npi_idx_to_cluster_idx [i]].append(npi_codes_used[i]) - cluster_desc[npi_idx_to_cluster_idx[0] + cluster_desc[npi_idx_to_cluster_idx [i]].append(str(npis_used[i])) # create clustered dataframe From c1c97aba04919478201964db0969808d6f5d8ed0 Mon Sep 17 00:00:00 2001 From: annawendler Date: Fri, 7 Jul 2023 12:20:50 +0200 Subject: [PATCH 06/30] Renaming + discussion points --- .../memilio/epidata/assessNPIEffects.py | 115 ++++++++++-------- 1 file changed, 64 insertions(+), 51 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index d3fb86d2cb..cdc3105015 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -75,12 +75,15 @@ def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): return clusters, p_diag.sum()+p_offdiag.sum() +# TODO: Used name 'methods' instead of 'metrics'. This is conform with documentation of hierarchy.linkage + # and does not lead to confusion with metric used in pdist. To discuss! + def compute_hierarch_clustering(corr_mat, corr_pairwdist, - metrics=['single', 'complete', 'average', + methods=['single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward']): - """! Computes a hierarchical clustering for a (list of) metric(s) and + """! Computes a hierarchical clustering for a (list of) method(s) and provides the maximum cophenetic distance(s) as well as a score for the clustering (see @method evaluate_clustering(...)). 
@@ -88,41 +91,46 @@ def compute_hierarch_clustering(corr_mat, corr_pairwdist, to be clustered hierarchically. @param corr_pairwdist Computed pairwise distance between the features / data set items. - @param metric Metric or list of metrics to compute the hierarchical + @param method method or list of methods to compute the hierarchical clustering. @return (List of) hierarchical clustering(s), maximum cophenetic distance(s) and scores of the hierarchical clustering. """ - # NOTE: if changing metric, pay attention to linkage methods; + + # NOTE: if changing method, pay attention to linkage methods; # 'centroid', 'median', and 'ward' are correctly defined only if - # Euclidean pairwise metric is used. + # Euclidean pairwise metric is used in distance matrix that we used as input. + # Based on the distances, we compute an hierarchical clustering for - # different metrics - max_coph_corr = 0 + # different methods + max_coph_corr_dist = 0 scores = dict() # allow single entry - if not isinstance(metrics, list): - metrics = [metrics] + if not isinstance(methods, list): + methods = [methods] # iterate over list - for metric in metrics: - cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=metric) - # compute cophenetic correlation distance - coph_corr, coph_dists = hierarchy.cophenet( - cluster_hierarch, pdist(corr_mat)) - scores[metric] = coph_corr - if coph_corr > max_coph_corr: - max_coph_corr = coph_corr - max_metric = metric - max_coph_dist = coph_dists - - cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=max_metric) + for method in methods: + cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=method) + # compute cophenetic correlation distance and cophenetic distance matrix + # TODO: Why was pdist(corr_mat) used as input for hierarchy.cophenet? + # Shouldn't we use the same distance matrix as input both for linkage and cophenet? + # To discuss! 
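    # Note on the TODO above: SciPy documents the second argument of
    # hierarchy.cophenet as the condensed distance matrix from which the
    # linkage was generated, so passing corr_pairwdist (as below) matches that
    # usage; pdist(corr_mat) only coincides with it when corr_mat is exactly
    # the matrix corr_pairwdist was computed from (not its absolute value).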
+ coph_corr_dist, coph_dist_mat = hierarchy.cophenet( + cluster_hierarch, corr_pairwdist) + scores[method] = coph_corr_dist + if coph_corr_dist > max_coph_corr_dist: + max_coph_corr_dist = coph_corr_dist + max_method = method + max_coph_dist_mat = coph_dist_mat + + cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=max_method) print( - "Cophentic correlation distance for metric " + max_metric + ": " + - str(max_coph_corr)) + "Cophenetic correlation distance for method " + max_method + ": " + + str(max_coph_corr_dist)) - return cluster_hierarch, max_coph_dist, scores + return cluster_hierarch, max_coph_dist_mat, scores def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): @@ -306,68 +314,73 @@ def analyze_npi_data( corr_pairwdist = hierarchy.distance.pdist( npis_corr, metric='euclidean') - # compute hierarchical clustering (via best-suited metric) - compare_metrics = True - if compare_metrics: + # compute hierarchical clustering (via best-suited method) + compare_methods = True + if compare_methods: + # centroid - metric = 'centroid' - cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( + method = 'centroid' + cluster_hierarch, coph_dist_mat, scores = compute_hierarch_clustering( abs(npis_corr), corr_pairwdist, - metric) + method) # # plot dendrogram plt.figure() - plt.title(metric) + plt.title(method) hierarchy.dendrogram(cluster_hierarch) # plt.show() - max_coph_dist = coph_dist.max() + max_coph_dist = coph_dist_mat.max() + # TODO: Discuss why abs(npis_corr) is used as input and not corr_pairwdist flatten_hierarch_clustering( abs(npis_corr), cluster_hierarch, [wg * max_coph_dist - for wg in np.linspace(0.01,1,100)]) + for wg in np.linspace(0.01, 1, 100)]) + # ward - metric = 'ward' - cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( + method = 'ward' + cluster_hierarch, coph_dist_mat, scores = compute_hierarch_clustering( npis_corr, corr_pairwdist, - metric) + method) # # plot dendrogram # plt.figure() - # plt.title(metric) + # plt.title(method) # hierarchy.dendrogram(cluster_hierarch) # plt.show() - max_coph_dist = coph_dist.max() + max_coph_dist = coph_dist_mat.max() flatten_hierarch_clustering( abs(npis_corr), cluster_hierarch, - [wg * max_coph_dist for wg in np.linspace(0.01,1,100)]) + [wg * max_coph_dist for wg in np.linspace(0.01, 1, 100)]) + # average - metric = 'average' - cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( + method = 'average' + cluster_hierarch, coph_dist_mat, scores = compute_hierarch_clustering( abs(npis_corr), corr_pairwdist, - metric) + method) # # plot dendrogram # plt.figure() - # plt.title(metric) + # plt.title(method) # hierarchy.dendrogram(cluster_hierarch) # plt.show() - max_coph_dist = coph_dist.max() + max_coph_dist = coph_dist_mat.max() flatten_hierarch_clustering( npis_corr, cluster_hierarch, [wg * max_coph_dist - for wg in np.linspace(0.01,1,100)]) + for wg in np.linspace(0.01, 1, 100)]) - metric = 'centroid' - cluster_hierarch, coph_dist, scores = compute_hierarch_clustering( - npis_corr, # same result as abs(npis_corr) + # centroid + method = 'centroid' + cluster_hierarch, coph_dist_mat, scores = compute_hierarch_clustering( + npis_corr, # same result as abs(npis_corr) corr_pairwdist, - metric) + method) # # plot dendrogram # plt.figure() - # plt.title(metric) + # plt.title(method) # hierarchy.dendrogram(cluster_hierarch) # plt.show() - max_coph_dist = coph_dist.max() + max_coph_dist = coph_dist_mat.max() npi_idx_to_cluster_idx = flatten_hierarch_clustering( npis_corr, 
cluster_hierarch, [wg * max_coph_dist From 98c184d36ee1f5e94a09754977ecd49012f36785 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 17 Jul 2023 15:03:29 +0200 Subject: [PATCH 07/30] use time period for clustering --- .../memilio/epidata/assessNPIEffects.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index cdc3105015..b6d707cf4d 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -18,13 +18,14 @@ # limitations under the License. ############################################################################# import os - +from datetime import date import matplotlib.pyplot as plt import numpy as np import pandas as pd from memilio.epidata import customPlot from memilio.epidata import defaultDict as dd from memilio.epidata import getDataIntoPandasDataFrame as gd +from memilio.epidata import modifyDataframeSeries as mdfs from scipy.cluster import hierarchy from scipy.spatial.distance import pdist @@ -207,6 +208,7 @@ def analyze_npi_data( df_npis = df_npis.drop('Unnamed: 0', axis=1) except KeyError: pass + df_npis = mdfs.extract_subframe_based_on_dates(df_npis, date(2021,1,1), date(2021,6,1)) npis = pd.read_json(os.path.join(directory, 'npis.json')) # merge subcodes of considered npis (Do we want this? To discuss...) # get code levels (main/subcodes) and position of main codes @@ -342,11 +344,11 @@ def analyze_npi_data( npis_corr, corr_pairwdist, method) - # # plot dendrogram - # plt.figure() - # plt.title(method) - # hierarchy.dendrogram(cluster_hierarch) - # plt.show() + # plot dendrogram + plt.figure() + plt.title(method) + hierarchy.dendrogram(cluster_hierarch) + plt.show() max_coph_dist = coph_dist_mat.max() flatten_hierarch_clustering( abs(npis_corr), cluster_hierarch, @@ -358,11 +360,11 @@ def analyze_npi_data( abs(npis_corr), corr_pairwdist, method) - # # plot dendrogram - # plt.figure() - # plt.title(method) - # hierarchy.dendrogram(cluster_hierarch) - # plt.show() + # plot dendrogram + plt.figure() + plt.title(method) + hierarchy.dendrogram(cluster_hierarch) + plt.show() max_coph_dist = coph_dist_mat.max() flatten_hierarch_clustering( npis_corr, cluster_hierarch, @@ -375,11 +377,11 @@ def analyze_npi_data( npis_corr, # same result as abs(npis_corr) corr_pairwdist, method) - # # plot dendrogram - # plt.figure() - # plt.title(method) - # hierarchy.dendrogram(cluster_hierarch) - # plt.show() + # plot dendrogram + plt.figure() + plt.title(method) + hierarchy.dendrogram(cluster_hierarch) + plt.show() max_coph_dist = coph_dist_mat.max() npi_idx_to_cluster_idx = flatten_hierarch_clustering( npis_corr, cluster_hierarch, @@ -476,6 +478,7 @@ def analyze_npi_data( 'Counties implementing NPI main categories', 'Date', 'Number', "Counties_NPI_main_" + str(j) + "_of_"+str(num_images)) + plt.tight_layout() j += 1 From 7984ec2ba72bf481ece66104ef63df6f8e3f003a Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 14 Aug 2023 15:46:33 +0200 Subject: [PATCH 08/30] Use Silhouette value to evaluate clustering --- .../memilio/epidata/assessNPIEffects.py | 122 +++++++++++++----- 1 file changed, 93 insertions(+), 29 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index b6d707cf4d..9526f3b37c 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ 
b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -18,17 +18,15 @@ # limitations under the License. ############################################################################# import os -from datetime import date import matplotlib.pyplot as plt +import matplotlib.cm as cm import numpy as np import pandas as pd from memilio.epidata import customPlot from memilio.epidata import defaultDict as dd from memilio.epidata import getDataIntoPandasDataFrame as gd -from memilio.epidata import modifyDataframeSeries as mdfs from scipy.cluster import hierarchy -from scipy.spatial.distance import pdist - +from sklearn.metrics import silhouette_samples, silhouette_score def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): """! Computes a score for a particular clustering based on the @@ -56,6 +54,9 @@ def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): clusters_perp = [[] for i in range(idx_to_cluster_idx.max()+1)] for ii in range(len(clusters)): clusters_perp[ii] = list(indices_all.difference(set(clusters[ii]))) + + ### old method + ''' # extract correlation values of block diagonals and offdiagonals separ. corr_diag = [] corr_offdiag = [] @@ -71,16 +72,24 @@ def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): for i in range(len(corr_thresholds))])/len(corr_offdiag)*step_width p_offdiag = np.array([len(np.where(corr_diag < corr_thresholds[i])[0]) for i in range(len(corr_thresholds))])/len(corr_diag)*step_width - - print(p_diag.sum(), p_offdiag.sum()) + return clusters, p_diag.sum()+p_offdiag.sum() + ''' + ### new method + if idx_to_cluster_idx.max() > 0: + sample_silhouette_values = silhouette_samples(corr_mat, idx_to_cluster_idx) + + if (idx_to_cluster_idx.max() < 15): + return clusters, -1 + else: + return clusters, sample_silhouette_values.min() # TODO: Used name 'methods' instead of 'metrics'. This is conform with documentation of hierarchy.linkage # and does not lead to confusion with metric used in pdist. To discuss! -def compute_hierarch_clustering(corr_mat, corr_pairwdist, +def compute_hierarch_clustering(corr_pairwdist, methods=['single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward']): @@ -117,6 +126,7 @@ def compute_hierarch_clustering(corr_mat, corr_pairwdist, # TODO: Why was pdist(corr_mat) used as input for hierarchy.cophenet? # Shouldn't we use the same distance matrix as input both for linkage and cophenet? # To discuss! + # TODO: < or > max_coph_corr_dist? 
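    # Note on the TODO above: hierarchy.cophenet returns the cophenetic
    # correlation coefficient first; values closer to 1 mean the dendrogram
    # preserves the original pairwise distances better, so keeping the method
    # with the largest coefficient ('>', as implemented) is the usual choice.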
coph_corr_dist, coph_dist_mat = hierarchy.cophenet( cluster_hierarch, corr_pairwdist) scores[method] = coph_corr_dist @@ -131,7 +141,7 @@ def compute_hierarch_clustering(corr_mat, corr_pairwdist, "Cophenetic correlation distance for method " + max_method + ": " + str(max_coph_corr_dist)) - return cluster_hierarch, max_coph_dist_mat, scores + return cluster_hierarch, max_coph_dist_mat def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): @@ -172,10 +182,62 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): n+=1 # print scores on clustering - print("Number of clusters: " + str(len(clusters)) + "; evaluation number: " + str(round(np.nanmax(abs(np.array(total_eval_number))), 4))) + print("Number of clusters: " + str(len(clusters)) + "; evaluation number: " + str(round(np.nanmax(np.array(total_eval_number)), 4))) + + return npi_idx_to_cluster_idx_list[total_eval_number.index(np.nanmax(np.array(total_eval_number)))] + +def silhouette(X, cluster_sizes, cluster_labels, label): + + if not isinstance(cluster_sizes, list): + cluster_sizes = [cluster_sizes] + + plt.figure() + + for n_clusters in cluster_sizes: + + sample_silhouette_values = silhouette_samples(X, cluster_labels) - return npi_idx_to_cluster_idx_list[total_eval_number.index(np.nanmax(abs(np.array(total_eval_number))))] + y_lower = 10 + for i in range(n_clusters): + ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i] + ith_cluster_silhouette_values.sort() + + size_cluster_i = ith_cluster_silhouette_values.shape[0] + y_upper = y_lower + size_cluster_i + + color = cm.hsv(float(i) / n_clusters) + plt.fill_betweenx( + np.arange(y_lower, y_upper), + 0, + ith_cluster_silhouette_values, + facecolor=color, + edgecolor=color, + alpha=0.7, + ) + + plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) + + y_lower = y_upper + 10 + + plt.xlabel("The silhouette coefficient values") + plt.ylabel("Cluster label") + + plt.axvline(x=0, c='black', alpha = 0.3) + plt.axvline(x=0.25, c='black', alpha = 0.3) + plt.axvline(x=0.5, c='black', alpha = 0.3) + plt.axvline(x=0.75, c='black', alpha = 0.3) + + plt.yticks([]) # Clear the yaxis labels / ticks + plt.xticks([-0.4,-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1]) + + plt.title( + "Silhouette analysis for " + label + " clustering with n_clusters = " + str(n_clusters), + ) + + plt.show(block=False) + + return sample_silhouette_values def analyze_npi_data( read_data, make_plot, fine_resolution, npis, directory, file_format, @@ -208,9 +270,8 @@ def analyze_npi_data( df_npis = df_npis.drop('Unnamed: 0', axis=1) except KeyError: pass - df_npis = mdfs.extract_subframe_based_on_dates(df_npis, date(2021,1,1), date(2021,6,1)) + #df_npis = mdfs.extract_subframe_based_on_dates(df_npis, date(2021,1,1), date(2021,6,1)) npis = pd.read_json(os.path.join(directory, 'npis.json')) - # merge subcodes of considered npis (Do we want this? To discuss...) # get code levels (main/subcodes) and position of main codes # code_level = [i.count('_') for i in npi_codes] # main_code_pos = [i for i in range(len(code_level)) if code_level[i] == 1] @@ -313,8 +374,11 @@ def analyze_npi_data( # to the others as one node in the #NPIs-used-dimensional space. # We compute the pairwise distances of these nodes. Then, nodes with # similar correlations towards all other nodes exhibit small distances + # TODO: difference between scipy.cluster.hierarchy.distance.pdist and scipy.spatial.distance.pdist? 
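        # Note on the TODO above: scipy.cluster.hierarchy re-exports
        # scipy.spatial.distance as hierarchy.distance, so both names refer to
        # the same pdist function; the call below yields the condensed
        # Euclidean distances between the rows of npis_corr.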
corr_pairwdist = hierarchy.distance.pdist( npis_corr, metric='euclidean') + + npi_codes_used = np.asarray(df_npis_used.iloc[:, 2:].columns) # compute hierarchical clustering (via best-suited method) compare_methods = True @@ -322,59 +386,59 @@ def analyze_npi_data( # centroid method = 'centroid' - cluster_hierarch, coph_dist_mat, scores = compute_hierarch_clustering( - abs(npis_corr), + cluster_hierarch, coph_dist_mat = compute_hierarch_clustering( corr_pairwdist, method) - # # plot dendrogram + # plot dendrogram plt.figure() plt.title(method) hierarchy.dendrogram(cluster_hierarch) # plt.show() max_coph_dist = coph_dist_mat.max() # TODO: Discuss why abs(npis_corr) is used as input and not corr_pairwdist - flatten_hierarch_clustering( + npi_idx_to_cluster_idx = flatten_hierarch_clustering( abs(npis_corr), cluster_hierarch, [wg * max_coph_dist - for wg in np.linspace(0.01, 1, 100)]) + for wg in np.linspace(0.01, 1, 500)]) + + samples = silhouette(npis_corr, npi_idx_to_cluster_idx.max()+1, npi_idx_to_cluster_idx, label = method) # ward method = 'ward' - cluster_hierarch, coph_dist_mat, scores = compute_hierarch_clustering( - npis_corr, + cluster_hierarch, coph_dist_mat = compute_hierarch_clustering( corr_pairwdist, method) # plot dendrogram plt.figure() plt.title(method) hierarchy.dendrogram(cluster_hierarch) - plt.show() max_coph_dist = coph_dist_mat.max() - flatten_hierarch_clustering( + npi_idx_to_cluster_idx = flatten_hierarch_clustering( abs(npis_corr), cluster_hierarch, - [wg * max_coph_dist for wg in np.linspace(0.01, 1, 100)]) + [wg * max_coph_dist for wg in np.linspace(0.01, 1, 500)]) + + samples = silhouette(npis_corr, npi_idx_to_cluster_idx.max()+1, npi_idx_to_cluster_idx, label = method) # average method = 'average' - cluster_hierarch, coph_dist_mat, scores = compute_hierarch_clustering( - abs(npis_corr), + cluster_hierarch, coph_dist_mat = compute_hierarch_clustering( corr_pairwdist, method) # plot dendrogram plt.figure() plt.title(method) hierarchy.dendrogram(cluster_hierarch) - plt.show() max_coph_dist = coph_dist_mat.max() - flatten_hierarch_clustering( + npi_idx_to_cluster_idx = flatten_hierarch_clustering( npis_corr, cluster_hierarch, [wg * max_coph_dist - for wg in np.linspace(0.01, 1, 100)]) + for wg in np.linspace(0.01, 1, 500)]) + + samples = silhouette(npis_corr, npi_idx_to_cluster_idx.max()+1, npi_idx_to_cluster_idx, label = method) # centroid method = 'centroid' - cluster_hierarch, coph_dist_mat, scores = compute_hierarch_clustering( - npis_corr, # same result as abs(npis_corr) + cluster_hierarch, coph_dist_mat = compute_hierarch_clustering( corr_pairwdist, method) # plot dendrogram From 1761a0d4e17da88bcc02ffa5ec6a22114cb0004a Mon Sep 17 00:00:00 2001 From: patricklnz Date: Tue, 12 Sep 2023 14:09:08 +0200 Subject: [PATCH 09/30] clustering to 55 clusters --- .../memilio/epidata/assessNPIEffects.py | 133 +++++++++++------- 1 file changed, 82 insertions(+), 51 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 9526f3b37c..27b71f7d11 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -26,7 +26,8 @@ from memilio.epidata import defaultDict as dd from memilio.epidata import getDataIntoPandasDataFrame as gd from scipy.cluster import hierarchy -from sklearn.metrics import silhouette_samples, silhouette_score +from sklearn.metrics import silhouette_samples + def 
evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): """! Computes a score for a particular clustering based on the @@ -55,7 +56,7 @@ def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): for ii in range(len(clusters)): clusters_perp[ii] = list(indices_all.difference(set(clusters[ii]))) - ### old method + # old method ''' # extract correlation values of block diagonals and offdiagonals separ. corr_diag = [] @@ -76,14 +77,18 @@ def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): return clusters, p_diag.sum()+p_offdiag.sum() ''' - ### new method + # new method if idx_to_cluster_idx.max() > 0: - sample_silhouette_values = silhouette_samples(corr_mat, idx_to_cluster_idx) + sample_silhouette_values = silhouette_samples( + corr_mat, idx_to_cluster_idx) + else: + return clusters, np.nan - if (idx_to_cluster_idx.max() < 15): - return clusters, -1 + # get minimal cluster size with at least weak structuring (silhouette value 0 or greater than 0.25) in all clusters + if not ((sample_silhouette_values > 0.25) | (sample_silhouette_values == 0)).all(): + return clusters, np.nan else: - return clusters, sample_silhouette_values.min() + return clusters, idx_to_cluster_idx.max()+1 # TODO: Used name 'methods' instead of 'metrics'. This is conform with documentation of hierarchy.linkage # and does not lead to confusion with metric used in pdist. To discuss! @@ -167,7 +172,7 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): weights = [weights] total_eval_number = [[] for weight in weights] - n=0 + n = 0 # iterate over weights for weight in weights: # use the given weight to flatten the dendrogram @@ -175,16 +180,23 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): cluster_hierarch, weight, criterion='distance') # evaluate clustering - clusters, total_eval_number[n]=evaluate_clustering(corr_mat, npi_idx_to_cluster_idx, npi_indices_all) + #clusters, total_eval_number[n] = evaluate_clustering( + # corr_mat, npi_idx_to_cluster_idx, npi_indices_all) # append new npi_idx to cluster_idx assignment to list of assignments - npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) - n+=1 - + #npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) + #n += 1 + # get around 55 clusters + if npi_idx_to_cluster_idx.max()>45: + if npi_idx_to_cluster_idx.max()<65: + print(npi_idx_to_cluster_idx.max()) + return npi_idx_to_cluster_idx + # print scores on clustering - print("Number of clusters: " + str(len(clusters)) + "; evaluation number: " + str(round(np.nanmax(np.array(total_eval_number)), 4))) + #print("Number of clusters: " + str(int(np.nanmin(np.array(total_eval_number))))) + + return npi_idx_to_cluster_idx_list[total_eval_number.index(np.nanmin(np.array(total_eval_number)))] - return npi_idx_to_cluster_idx_list[total_eval_number.index(np.nanmax(np.array(total_eval_number)))] def silhouette(X, cluster_sizes, cluster_labels, label): @@ -223,22 +235,24 @@ def silhouette(X, cluster_sizes, cluster_labels, label): plt.xlabel("The silhouette coefficient values") plt.ylabel("Cluster label") - plt.axvline(x=0, c='black', alpha = 0.3) - plt.axvline(x=0.25, c='black', alpha = 0.3) - plt.axvline(x=0.5, c='black', alpha = 0.3) - plt.axvline(x=0.75, c='black', alpha = 0.3) + plt.axvline(x=0, c='black', alpha=0.3) + plt.axvline(x=0.25, c='black', alpha=0.3) + plt.axvline(x=0.5, c='black', alpha=0.3) + plt.axvline(x=0.75, c='black', alpha=0.3) plt.yticks([]) # Clear the yaxis labels / ticks - plt.xticks([-0.4,-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1]) + 
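        # Note: silhouette values lie in [-1, 1]; the reference lines above at
        # 0.25, 0.5 and 0.75 follow a common rule of thumb for weak, reasonable
        # and strong cluster structure, respectively.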
plt.xticks([-0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1]) plt.title( - "Silhouette analysis for " + label + " clustering with n_clusters = " + str(n_clusters), + "Silhouette analysis for " + label + + " clustering with n_clusters = " + str(n_clusters), ) - - plt.show(block=False) + plt.tight_layout() + plt.show() return sample_silhouette_values + def analyze_npi_data( read_data, make_plot, fine_resolution, npis, directory, file_format, npi_codes_considered): @@ -270,16 +284,25 @@ def analyze_npi_data( df_npis = df_npis.drop('Unnamed: 0', axis=1) except KeyError: pass - #df_npis = mdfs.extract_subframe_based_on_dates(df_npis, date(2021,1,1), date(2021,6,1)) + # df_npis = mdfs.extract_subframe_based_on_dates(df_npis, date(2021,1,1), date(2021,6,1)) npis = pd.read_json(os.path.join(directory, 'npis.json')) - # get code levels (main/subcodes) and position of main codes - # code_level = [i.count('_') for i in npi_codes] - # main_code_pos = [i for i in range(len(code_level)) if code_level[i] == 1] + # get code levels (main/subcodes) and position of main codes + # code_level = [i.count('_') for i in npi_codes] + # main_code_pos = [i for i in range(len(code_level)) if code_level[i] == 1] # check if any other integer than 0: not implemented or 1: implemented is # used (maybe to specify the kind of implementation) - npi_codes_considered = npis.NPI_code.values.tolist()#[x for x in npis.NPI_code if len(x) <=8] + all_subcodes = [x for x in npis.NPI_code if len(x) <=8] + df_merged = df_npis.iloc[:, :2] + for subcode in all_subcodes: + # extract columns which have the subcode as part of the column + # name and sum over all these subcodes + df_merged[subcode] = df_npis.filter( + regex=subcode).sum(axis=1) + #npi_codes_considered = npis.NPI_code.values.tolist() + npi_codes_considered = all_subcodes + df_npis = df_merged.copy() if len(np.where(df_npis[npi_codes_considered] > 1)[0]) > 0: @@ -304,14 +327,16 @@ def analyze_npi_data( if npi_codes_considered[i] in npi_codes_empty: npi_unused_indices.append(i) npi_unused_indices_all.append( - np.where(npis[dd.EngEng['npiCode']] == 'M01a_010')[0][0]) + np.where(npis[dd.EngEng['npiCode']] == npi_codes_considered[i])[0][0]) else: npi_used_indices.append(i) npi_used_indices_all.append( - np.where(npis[dd.EngEng['npiCode']] == 'M01a_010')[0][0]) + np.where(npis[dd.EngEng['npiCode']] == npi_codes_considered[i])[0][0]) - npis_unused = np.array(npis['Description'])[npi_unused_indices_all] # for all fr1 codes: len=153 - npis_used = np.array(npis['Description'])[npi_used_indices_all] # for all fr1 codes: len=14 + npis_unused = np.array(npis['Description'])[ + npi_unused_indices_all] # for all fr1 codes: len=12 + npis_used = np.array(npis['Description'])[ + npi_used_indices_all] # for all fr1 codes: len=155 npi_codes_used = list(np.array(npi_codes_considered)[npi_used_indices]) npi_codes_unused = list( np.array(npi_codes_considered)[npi_unused_indices]) @@ -377,11 +402,11 @@ def analyze_npi_data( # TODO: difference between scipy.cluster.hierarchy.distance.pdist and scipy.spatial.distance.pdist? 
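            # Note: pdist returns a condensed 1-D distance vector with one entry
            # per NPI pair (length n*(n-1)/2 for n used NPIs), which is the input
            # format hierarchy.linkage expects in compute_hierarch_clustering.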
corr_pairwdist = hierarchy.distance.pdist( npis_corr, metric='euclidean') - + npi_codes_used = np.asarray(df_npis_used.iloc[:, 2:].columns) # compute hierarchical clustering (via best-suited method) - compare_methods = True + compare_methods = False if compare_methods: # centroid @@ -400,8 +425,9 @@ def analyze_npi_data( abs(npis_corr), cluster_hierarch, [wg * max_coph_dist for wg in np.linspace(0.01, 1, 500)]) - - samples = silhouette(npis_corr, npi_idx_to_cluster_idx.max()+1, npi_idx_to_cluster_idx, label = method) + + samples = silhouette(npis_corr, npi_idx_to_cluster_idx.max( + )+1, npi_idx_to_cluster_idx, label=method) # ward method = 'ward' @@ -416,8 +442,9 @@ def analyze_npi_data( npi_idx_to_cluster_idx = flatten_hierarch_clustering( abs(npis_corr), cluster_hierarch, [wg * max_coph_dist for wg in np.linspace(0.01, 1, 500)]) - - samples = silhouette(npis_corr, npi_idx_to_cluster_idx.max()+1, npi_idx_to_cluster_idx, label = method) + + samples = silhouette(npis_corr, npi_idx_to_cluster_idx.max( + )+1, npi_idx_to_cluster_idx, label=method) # average method = 'average' @@ -432,11 +459,12 @@ def analyze_npi_data( npi_idx_to_cluster_idx = flatten_hierarch_clustering( npis_corr, cluster_hierarch, [wg * max_coph_dist - for wg in np.linspace(0.01, 1, 500)]) - - samples = silhouette(npis_corr, npi_idx_to_cluster_idx.max()+1, npi_idx_to_cluster_idx, label = method) + for wg in np.linspace(0.01, 1, 500)]) - # centroid + samples = silhouette(npis_corr, npi_idx_to_cluster_idx.max( + )+1, npi_idx_to_cluster_idx, label=method) + + # centroid has less clusters method = 'centroid' cluster_hierarch, coph_dist_mat = compute_hierarch_clustering( corr_pairwdist, @@ -448,20 +476,23 @@ def analyze_npi_data( plt.show() max_coph_dist = coph_dist_mat.max() npi_idx_to_cluster_idx = flatten_hierarch_clustering( - npis_corr, cluster_hierarch, - [wg * max_coph_dist - for wg in [0.50]]) # 10 cluster + abs(npis_corr), cluster_hierarch, + [wg * max_coph_dist + for wg in np.linspace(0.01, 1, 100)]) + + silhouette(npis_corr, npi_idx_to_cluster_idx.max( + )+1, npi_idx_to_cluster_idx, label=method) cluster_dict = dict() - cluster_codes = [[] for i in range(npi_idx_to_cluster_idx.max()+1)] - cluster_desc = [[] for i in range(npi_idx_to_cluster_idx.max()+1)] + cluster_codes = [[] for i in range(npi_idx_to_cluster_idx.max())] + cluster_desc = [[] for i in range(npi_idx_to_cluster_idx.max())] for i in range(len(npi_idx_to_cluster_idx)): cluster_dict[npi_codes_used[i] - ] = "CM_" + str(npi_idx_to_cluster_idx[i]).zfill(3) + ] = "CM_" + str(npi_idx_to_cluster_idx[i]-1).zfill(3) cluster_codes[npi_idx_to_cluster_idx - [i]].append(npi_codes_used[i]) + [i]-1].append(npi_codes_used[i]) cluster_desc[npi_idx_to_cluster_idx - [i]].append(str(npis_used[i])) + [i]-1].append(str(npis_used[i])) # create clustered dataframe df_npis_clustered = df_npis[[ @@ -471,7 +502,7 @@ def analyze_npi_data( df_npis_clustered["CM_" + str(i).zfill(3) ] = df_npis[cluster_codes[i]].max(axis=1).copy() - npis_corr_cluster = df_npis_clustered.corr() + npis_corr_cluster = df_npis_clustered.iloc[:,2:].corr() # npis_corr_cluster[abs(npis_corr_cluster)<0.25] = 0 plt.imshow(abs(npis_corr_cluster), cmap='gray_r') plt.title('Absolute correlation>0.25 of clustered NPIs') @@ -498,7 +529,7 @@ def analyze_npi_data( # Closing file file_npi.close() - npi_idx_new = np.argsort(npi_idx_to_cluster_idx[0]) + npi_idx_new = np.argsort(npi_idx_to_cluster_idx) npis_corr_reorder = npis_corr[npi_idx_new, :][:, npi_idx_new] plt.imshow(abs(npis_corr_reorder), cmap='gray_r') From 
09bc94576fb471ed2c37925a37773dd79f484426 Mon Sep 17 00:00:00 2001 From: Patrick Lenz Date: Mon, 15 Jan 2024 11:09:10 +0100 Subject: [PATCH 10/30] review suggestions --- .../memilio/epidata/assessNPIEffects.py | 75 +++++++++---------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 27b71f7d11..f7aa1ff684 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -25,9 +25,14 @@ from memilio.epidata import customPlot from memilio.epidata import defaultDict as dd from memilio.epidata import getDataIntoPandasDataFrame as gd +from memilio.epidata import getNPIData as gnpi from scipy.cluster import hierarchy +from scipy.spatial import distance from sklearn.metrics import silhouette_samples +# activate CoW for more predictable behaviour of pandas DataFrames +pd.options.mode.copy_on_write = True + def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): """! Computes a score for a particular clustering based on the @@ -90,9 +95,6 @@ def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): else: return clusters, idx_to_cluster_idx.max()+1 -# TODO: Used name 'methods' instead of 'metrics'. This is conform with documentation of hierarchy.linkage - # and does not lead to confusion with metric used in pdist. To discuss! - def compute_hierarch_clustering(corr_pairwdist, methods=['single', 'complete', 'average', @@ -102,8 +104,6 @@ def compute_hierarch_clustering(corr_pairwdist, provides the maximum cophenetic distance(s) as well as a score for the clustering (see @method evaluate_clustering(...)). - @param corr_mat correlation matrix between the features / data set items - to be clustered hierarchically. @param corr_pairwdist Computed pairwise distance between the features / data set items. @param method method or list of methods to compute the hierarchical @@ -128,9 +128,6 @@ def compute_hierarch_clustering(corr_pairwdist, for method in methods: cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=method) # compute cophenetic correlation distance and cophenetic distance matrix - # TODO: Why was pdist(corr_mat) used as input for hierarchy.cophenet? - # Shouldn't we use the same distance matrix as input both for linkage and cophenet? - # To discuss! # TODO: < or > max_coph_corr_dist? 
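Regarding the comparison question in the TODO above: the cophenetic correlation coefficient is a Pearson correlation (range [-1, 1]) between the dendrogram's cophenetic distances and the input pairwise distances, so a larger value means a better fit and the running maximum should use '>'. A minimal sketch of that selection loop on random points (not the NPI correlations):

import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial import distance

rng = np.random.default_rng(0)
pairw = distance.pdist(rng.random((8, 3)), metric='euclidean')

best_method, best_coeff = None, -1.0
for method in ['single', 'complete', 'average', 'ward']:
    linkage_mat = hierarchy.linkage(pairw, method=method)
    coeff, _ = hierarchy.cophenet(linkage_mat, pairw)
    if coeff > best_coeff:  # larger coefficient = dendrogram fits distances better
        best_method, best_coeff = method, coeff
print(best_method, round(best_coeff, 3))
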
coph_corr_dist, coph_dist_mat = hierarchy.cophenet( cluster_hierarch, corr_pairwdist) @@ -180,15 +177,15 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): cluster_hierarch, weight, criterion='distance') # evaluate clustering - #clusters, total_eval_number[n] = evaluate_clustering( + # clusters, total_eval_number[n] = evaluate_clustering( # corr_mat, npi_idx_to_cluster_idx, npi_indices_all) # append new npi_idx to cluster_idx assignment to list of assignments - #npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) + # npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) #n += 1 # get around 55 clusters - if npi_idx_to_cluster_idx.max()>45: - if npi_idx_to_cluster_idx.max()<65: + if npi_idx_to_cluster_idx.max() > 45: + if npi_idx_to_cluster_idx.max() < 65: print(npi_idx_to_cluster_idx.max()) return npi_idx_to_cluster_idx @@ -258,14 +255,13 @@ def analyze_npi_data( npi_codes_considered): if not read_data: - raise FileNotFoundError('') - # transform_npi_data(fine_resolution=2, - # file_format=dd.defaultDict['file_format'], - # out_folder=dd.defaultDict['out_folder'], - # start_date=dd.defaultDict['start_date'], - # end_date=dd.defaultDict['end_date'], - # make_plot=dd.defaultDict['make_plot'], - # ) + gnpi.get_npi_data(fine_resolution=2, + file_format=dd.defaultDict['file_format'], + out_folder=dd.defaultDict['out_folder'], + start_date=dd.defaultDict['start_date'], + end_date=dd.defaultDict['end_date'], + make_plot=dd.defaultDict['make_plot'], + ) else: # read formatted file npis = pd.read_excel( @@ -287,24 +283,26 @@ def analyze_npi_data( # df_npis = mdfs.extract_subframe_based_on_dates(df_npis, date(2021,1,1), date(2021,6,1)) npis = pd.read_json(os.path.join(directory, 'npis.json')) # get code levels (main/subcodes) and position of main codes - # code_level = [i.count('_') for i in npi_codes] - # main_code_pos = [i for i in range(len(code_level)) if code_level[i] == 1] # check if any other integer than 0: not implemented or 1: implemented is # used (maybe to specify the kind of implementation) - all_subcodes = [x for x in npis.NPI_code if len(x) <=8] + if not npi_codes_considered: + npi_codes_considered = [ + x for x in npis[dd.EngEng['npiCode']] if len(x.split('_')) == 2] df_merged = df_npis.iloc[:, :2] - for subcode in all_subcodes: + for subcode in npi_codes_considered: # extract columns which have the subcode as part of the column # name and sum over all these subcodes df_merged[subcode] = df_npis.filter( regex=subcode).sum(axis=1) - #npi_codes_considered = npis.NPI_code.values.tolist() - npi_codes_considered = all_subcodes - df_npis = df_merged.copy() - if len(np.where(df_npis[npi_codes_considered] > 1)[0]) > 0: + # make sure that only considered subcodes are in npis dataframe + npis = npis[npis[dd.EngEng['npiCode']].isin(npi_codes_considered)] + + df_npis = df_merged[:] + + if ~df_npis[npi_codes_considered].isin([0, 1]).any().any(): print("Info: Please ensure that NPI information is only boolean.") @@ -399,8 +397,7 @@ def analyze_npi_data( # to the others as one node in the #NPIs-used-dimensional space. # We compute the pairwise distances of these nodes. Then, nodes with # similar correlations towards all other nodes exhibit small distances - # TODO: difference between scipy.cluster.hierarchy.distance.pdist and scipy.spatial.distance.pdist? 
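A condensed, self-contained sketch of the subcode merging introduced above, with hypothetical column names and data: every column whose name contains a subcode is summed into one column, followed by one possible way to express the 0/1 sanity check (the patch formulates that check differently).

import pandas as pd

df_npis = pd.DataFrame({
    'Date': ['2021-03-01', '2021-03-02'],
    'ID_County': [1001, 1001],
    'M01a_010_1': [1, 0],
    'M01a_010_2': [0, 0],
    'M01a_020_1': [0, 1],
})
subcodes = ['M01a_010', 'M01a_020']
df_merged = df_npis.iloc[:, :2].copy()  # explicit copy; the patch enables pandas copy-on-write instead
for subcode in subcodes:
    # sum all columns that contain the subcode in their name
    df_merged[subcode] = df_npis.filter(regex=subcode).sum(axis=1)

# one way to warn if anything other than 0 or 1 remains after merging
if not df_merged[subcodes].isin([0, 1]).all().all():
    print("Info: Please ensure that NPI information is only boolean.")
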
- corr_pairwdist = hierarchy.distance.pdist( + corr_pairwdist = distance.pdist( npis_corr, metric='euclidean') npi_codes_used = np.asarray(df_npis_used.iloc[:, 2:].columns) @@ -476,23 +473,23 @@ def analyze_npi_data( plt.show() max_coph_dist = coph_dist_mat.max() npi_idx_to_cluster_idx = flatten_hierarch_clustering( - abs(npis_corr), cluster_hierarch, - [wg * max_coph_dist - for wg in np.linspace(0.01, 1, 100)]) - + abs(npis_corr), cluster_hierarch, + [wg * max_coph_dist + for wg in np.linspace(0.01, 1, 100)]) + silhouette(npis_corr, npi_idx_to_cluster_idx.max( - )+1, npi_idx_to_cluster_idx, label=method) + )+1, npi_idx_to_cluster_idx, label=method) cluster_dict = dict() cluster_codes = [[] for i in range(npi_idx_to_cluster_idx.max())] cluster_desc = [[] for i in range(npi_idx_to_cluster_idx.max())] for i in range(len(npi_idx_to_cluster_idx)): cluster_dict[npi_codes_used[i] - ] = "CM_" + str(npi_idx_to_cluster_idx[i]-1).zfill(3) + ] = "CM_" + str(npi_idx_to_cluster_idx[i]-1).zfill(3) cluster_codes[npi_idx_to_cluster_idx - [i]-1].append(npi_codes_used[i]) + [i]-1].append(npi_codes_used[i]) cluster_desc[npi_idx_to_cluster_idx - [i]-1].append(str(npis_used[i])) + [i]-1].append(str(npis_used[i])) # create clustered dataframe df_npis_clustered = df_npis[[ @@ -502,7 +499,7 @@ def analyze_npi_data( df_npis_clustered["CM_" + str(i).zfill(3) ] = df_npis[cluster_codes[i]].max(axis=1).copy() - npis_corr_cluster = df_npis_clustered.iloc[:,2:].corr() + npis_corr_cluster = df_npis_clustered.iloc[:, 2:].corr() # npis_corr_cluster[abs(npis_corr_cluster)<0.25] = 0 plt.imshow(abs(npis_corr_cluster), cmap='gray_r') plt.title('Absolute correlation>0.25 of clustered NPIs') From 445b37132ec6879ef73cdaae59aadfd6eb110fba Mon Sep 17 00:00:00 2001 From: Anna Wendler <106674756+annawendler@users.noreply.github.com> Date: Tue, 6 Feb 2024 09:43:23 +0100 Subject: [PATCH 11/30] comments from discussion and formatting --- .../memilio/epidata/assessNPIEffects.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index f7aa1ff684..6885f59305 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -182,7 +182,7 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): # append new npi_idx to cluster_idx assignment to list of assignments # npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) - #n += 1 + # n += 1 # get around 55 clusters if npi_idx_to_cluster_idx.max() > 45: if npi_idx_to_cluster_idx.max() < 65: @@ -190,7 +190,7 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): return npi_idx_to_cluster_idx # print scores on clustering - #print("Number of clusters: " + str(int(np.nanmin(np.array(total_eval_number))))) + # print("Number of clusters: " + str(int(np.nanmin(np.array(total_eval_number))))) return npi_idx_to_cluster_idx_list[total_eval_number.index(np.nanmin(np.array(total_eval_number)))] @@ -255,6 +255,7 @@ def analyze_npi_data( npi_codes_considered): if not read_data: + # return df_npis, npis or remove else from below gnpi.get_npi_data(fine_resolution=2, file_format=dd.defaultDict['file_format'], out_folder=dd.defaultDict['out_folder'], @@ -281,6 +282,7 @@ def analyze_npi_data( except KeyError: pass # df_npis = mdfs.extract_subframe_based_on_dates(df_npis, date(2021,1,1), date(2021,6,1)) + # TODO: Check if we need the next line npis = 
pd.read_json(os.path.join(directory, 'npis.json')) # get code levels (main/subcodes) and position of main codes @@ -300,6 +302,7 @@ def analyze_npi_data( # make sure that only considered subcodes are in npis dataframe npis = npis[npis[dd.EngEng['npiCode']].isin(npi_codes_considered)] + # use copy on write of df_merged df_npis = df_merged[:] if ~df_npis[npi_codes_considered].isin([0, 1]).any().any(): @@ -307,7 +310,7 @@ def analyze_npi_data( print("Info: Please ensure that NPI information is only boolean.") else: - # sum over different NPIs and plot share of countires implementing + # sum over different NPIs and plot share of counties implementing # these NPIs versus counties without corresponding actions df_npis_aggregated = df_npis.groupby( dd.EngEng['date']).agg( @@ -355,7 +358,7 @@ def analyze_npi_data( # Closing file file_npi.close() - # open file to write unused categories + # open file to write used categories if fine_resolution > 0: if fine_resolution == 1: filename = 'used_subcats_incgrouped.txt' @@ -364,7 +367,7 @@ def analyze_npi_data( else: filename = 'used_maincats.txt' file_npi = open(directory + filename, 'w') - # Writing unused NPIs + # Writing used NPIs for i in range(len(npis_used)): file_npi.write(npi_codes_used[i] + ": " + npis_used[i]) file_npi.write("\n") From fd0d621019d4d3c986667fc1c25bae775fe507d2 Mon Sep 17 00:00:00 2001 From: Anna Wendler <106674756+annawendler@users.noreply.github.com> Date: Tue, 27 Feb 2024 12:42:29 +0100 Subject: [PATCH 12/30] discussion --- pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 6885f59305..5e6b4b93c5 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -119,7 +119,7 @@ def compute_hierarch_clustering(corr_pairwdist, # Based on the distances, we compute an hierarchical clustering for # different methods - max_coph_corr_dist = 0 + max_coph_corr_dist = -1 scores = dict() # allow single entry if not isinstance(methods, list): @@ -128,7 +128,6 @@ def compute_hierarch_clustering(corr_pairwdist, for method in methods: cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=method) # compute cophenetic correlation distance and cophenetic distance matrix - # TODO: < or > max_coph_corr_dist? coph_corr_dist, coph_dist_mat = hierarchy.cophenet( cluster_hierarch, corr_pairwdist) scores[method] = coph_corr_dist @@ -137,6 +136,7 @@ def compute_hierarch_clustering(corr_pairwdist, max_method = method max_coph_dist_mat = coph_dist_mat + # recompute hierarchical clustering with maximal cophenetic correlation distance/coefficient cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=max_method) print( @@ -147,6 +147,7 @@ def compute_hierarch_clustering(corr_pairwdist, def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): + # TODO: Adapt comment on cophenetic distance and weights """! Flattens a hierarchical clustering for a (list of) maximum cophenetic distance(s) in the flat clusters and evaluates the resulting clustering with respect to the corresponding correlation matrix. 
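As a toy illustration of the df_npis_aggregated groupby referenced in the 'share of counties implementing' comment fixed above, using made-up counties and dates: summing the 0/1 NPI columns per day counts the counties with each NPI active.

import pandas as pd

df_npis = pd.DataFrame({
    'Date': ['2021-03-01', '2021-03-01', '2021-03-02', '2021-03-02'],
    'ID_County': [1001, 1002, 1001, 1002],
    'M01a_010': [1, 0, 1, 1],
    'M01a_020': [0, 0, 1, 0],
})
df_npis_aggregated = df_npis.groupby('Date').agg(
    {code: 'sum' for code in ['M01a_010', 'M01a_020']})
print(df_npis_aggregated)
#             M01a_010  M01a_020
# Date
# 2021-03-01         1         0
# 2021-03-02         2         1
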
@@ -475,6 +476,7 @@ def analyze_npi_data( hierarchy.dendrogram(cluster_hierarch) plt.show() max_coph_dist = coph_dist_mat.max() + # TODO: Discuss why abs(npis_corr) is used as input and not corr_pairwdist npi_idx_to_cluster_idx = flatten_hierarch_clustering( abs(npis_corr), cluster_hierarch, [wg * max_coph_dist From fcbecfd6134cf3826cb11723d8f9ca384259bc6a Mon Sep 17 00:00:00 2001 From: Anna Wendler <106674756+annawendler@users.noreply.github.com> Date: Tue, 26 Mar 2024 16:23:42 +0100 Subject: [PATCH 13/30] renaming of max_coph_corr_coeff --- .../memilio/epidata/assessNPIEffects.py | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 5e6b4b93c5..8f79918333 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -119,7 +119,7 @@ def compute_hierarch_clustering(corr_pairwdist, # Based on the distances, we compute an hierarchical clustering for # different methods - max_coph_corr_dist = -1 + max_coph_corr_coeff = -1 scores = dict() # allow single entry if not isinstance(methods, list): @@ -127,12 +127,12 @@ def compute_hierarch_clustering(corr_pairwdist, # iterate over list for method in methods: cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=method) - # compute cophenetic correlation distance and cophenetic distance matrix - coph_corr_dist, coph_dist_mat = hierarchy.cophenet( + # compute cophenetic correlation distance/coefficient and cophenetic distance matrix + coph_corr_coeff, coph_dist_mat = hierarchy.cophenet( cluster_hierarch, corr_pairwdist) - scores[method] = coph_corr_dist - if coph_corr_dist > max_coph_corr_dist: - max_coph_corr_dist = coph_corr_dist + scores[method] = coph_corr_coeff + if coph_corr_coeff > max_coph_corr_coeff: + max_coph_corr_coeff = coph_corr_coeff max_method = method max_coph_dist_mat = coph_dist_mat @@ -141,7 +141,7 @@ def compute_hierarch_clustering(corr_pairwdist, print( "Cophenetic correlation distance for method " + max_method + ": " + - str(max_coph_corr_dist)) + str(max_coph_corr_coeff)) return cluster_hierarch, max_coph_dist_mat @@ -176,6 +176,7 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): # use the given weight to flatten the dendrogram npi_idx_to_cluster_idx = hierarchy.fcluster( cluster_hierarch, weight, criterion='distance') + npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) # evaluate clustering # clusters, total_eval_number[n] = evaluate_clustering( @@ -185,15 +186,16 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): # npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) # n += 1 # get around 55 clusters - if npi_idx_to_cluster_idx.max() > 45: - if npi_idx_to_cluster_idx.max() < 65: - print(npi_idx_to_cluster_idx.max()) - return npi_idx_to_cluster_idx + # if npi_idx_to_cluster_idx.max() > 45: + # if npi_idx_to_cluster_idx.max() < 65: + # print(npi_idx_to_cluster_idx.max()) + # return npi_idx_to_cluster_idx # print scores on clustering # print("Number of clusters: " + str(int(np.nanmin(np.array(total_eval_number))))) - return npi_idx_to_cluster_idx_list[total_eval_number.index(np.nanmin(np.array(total_eval_number)))] + # [total_eval_number.index(np.nanmin(np.array(total_eval_number)))] + return npi_idx_to_cluster_idx_list[0] def silhouette(X, cluster_sizes, cluster_labels, label): @@ -587,7 +589,7 @@ def main(): npis_final = [] 
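A self-contained sketch (random data, not the NPI correlation matrix) of the flattening idea used in flatten_hierarch_clustering: cut the dendrogram at several fractions of the maximum cophenetic distance, relabel clusters to start at zero, and collect every clustering in a list for later scoring.

import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial import distance

rng = np.random.default_rng(1)
pairw = distance.pdist(rng.random((10, 4)), metric='euclidean')
linkage_mat = hierarchy.linkage(pairw, method='ward')
_, coph_dist_mat = hierarchy.cophenet(linkage_mat, pairw)
max_coph_dist = coph_dist_mat.max()

clusterings = []
for weight in np.linspace(0.2, 1.0, 5):
    labels = hierarchy.fcluster(
        linkage_mat, weight * max_coph_dist, criterion='distance')
    labels -= labels.min()  # relabel clusters to start at zero
    clusterings.append(labels)
    print(round(weight, 2), labels.max() + 1, 'clusters')
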
directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') file_format = 'json' - npi_codes_considered = ['M01a_010', 'M01a_020'] + npi_codes_considered = ['M01a_010', 'M01a_020', 'M01a_100'] analyze_npi_data(True, True, fine_resolution, npis_final, directory, file_format, npi_codes_considered) From 2b8465caeec90b705184faffdecabe25947624c6 Mon Sep 17 00:00:00 2001 From: Anna Wendler <106674756+annawendler@users.noreply.github.com> Date: Wed, 27 Mar 2024 10:53:24 +0100 Subject: [PATCH 14/30] some adjustments and todos --- .../memilio/epidata/assessNPIEffects.py | 117 +++++++++++------- 1 file changed, 69 insertions(+), 48 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 8f79918333..65db7b23e9 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -28,7 +28,7 @@ from memilio.epidata import getNPIData as gnpi from scipy.cluster import hierarchy from scipy.spatial import distance -from sklearn.metrics import silhouette_samples +from sklearn.metrics import silhouette_samples, silhouette_score # activate CoW for more predictable behaviour of pandas DataFrames pd.options.mode.copy_on_write = True @@ -48,7 +48,7 @@ def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): @return Scores for the provided clustering. """ - + # TODO: Why? if idx_to_cluster_idx.min() == 1: idx_to_cluster_idx -= 1 @@ -83,17 +83,25 @@ def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): return clusters, p_diag.sum()+p_offdiag.sum() ''' # new method - if idx_to_cluster_idx.max() > 0: + # do this only if len(np.unique(idx_to_cluster_idx)) < len(indices_all), which checks that we do not have + # only clusters with one element, this is a requirement for silhouette_samples + # TODO: check here that not all clusters are in their own cluster? + # TODO: ask why it is checked that maximum is greater than 0? + if idx_to_cluster_idx.max() > 0 and len(np.unique(idx_to_cluster_idx)) < len(indices_all): + # TODO: this is just a dummy that needs to be removed later + # idx_to_cluster_idx = np.array([0, 1, 1]) sample_silhouette_values = silhouette_samples( corr_mat, idx_to_cluster_idx) else: return clusters, np.nan # get minimal cluster size with at least weak structuring (silhouette value 0 or greater than 0.25) in all clusters + # TODO: this would be easier without negation? if not ((sample_silhouette_values > 0.25) | (sample_silhouette_values == 0)).all(): return clusters, np.nan else: - return clusters, idx_to_cluster_idx.max()+1 + # return clusters and mean silhouette coefficient + return clusters, silhouette_score(corr_mat, idx_to_cluster_idx) def compute_hierarch_clustering(corr_pairwdist, @@ -146,7 +154,7 @@ def compute_hierarch_clustering(corr_pairwdist, return cluster_hierarch, max_coph_dist_mat -def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): +def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights, method): # TODO: Adapt comment on cophenetic distance and weights """! 
Flattens a hierarchical clustering for a (list of) maximum cophenetic distance(s) in the flat clusters and evaluates the resulting clustering with @@ -169,7 +177,8 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): if not isinstance(weights, list): weights = [weights] - total_eval_number = [[] for weight in weights] + # set total_eval_number to -1 for each weight because this is the minimal possible silhouette coefficient + total_eval_number = -1*np.ones(len(weights)) n = 0 # iterate over weights for weight in weights: @@ -179,8 +188,8 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) # evaluate clustering - # clusters, total_eval_number[n] = evaluate_clustering( - # corr_mat, npi_idx_to_cluster_idx, npi_indices_all) + clusters, total_eval_number[n] = evaluate_clustering( + corr_mat, npi_idx_to_cluster_idx, npi_indices_all) # append new npi_idx to cluster_idx assignment to list of assignments # npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) @@ -195,62 +204,70 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights): # print("Number of clusters: " + str(int(np.nanmin(np.array(total_eval_number))))) # [total_eval_number.index(np.nanmin(np.array(total_eval_number)))] - return npi_idx_to_cluster_idx_list[0] + + # return npi_idx_to_cluster_idx with highest mean silhouette coefficient + # if we get same value for multiple weights, take smallest weight + # TODO: Discuss if this is a good metric + return npi_idx_to_cluster_idx_list[np.where(total_eval_number == np.nanmax(np.array(total_eval_number)))[0][0]] def silhouette(X, cluster_sizes, cluster_labels, label): + # TODO: check for clusterings where every sample is in their own cluster + # in that case we cannot use silhouette_samples because we need 'Valid values are 2 to n_samples - 1 (inclusive)' + if not isinstance(cluster_sizes, list): cluster_sizes = [cluster_sizes] plt.figure() - for n_clusters in cluster_sizes: + if len(np.unique(cluster_labels)) < X.shape[0]: + for n_clusters in cluster_sizes: + sample_silhouette_values = silhouette_samples(X, cluster_labels) - sample_silhouette_values = silhouette_samples(X, cluster_labels) + y_lower = 10 + for i in range(n_clusters): + ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i] - y_lower = 10 - for i in range(n_clusters): - ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i] + ith_cluster_silhouette_values.sort() - ith_cluster_silhouette_values.sort() + size_cluster_i = ith_cluster_silhouette_values.shape[0] + y_upper = y_lower + size_cluster_i - size_cluster_i = ith_cluster_silhouette_values.shape[0] - y_upper = y_lower + size_cluster_i + color = cm.hsv(float(i) / n_clusters) + plt.fill_betweenx( + np.arange(y_lower, y_upper), + 0, + ith_cluster_silhouette_values, + facecolor=color, + edgecolor=color, + alpha=0.7, + ) - color = cm.hsv(float(i) / n_clusters) - plt.fill_betweenx( - np.arange(y_lower, y_upper), - 0, - ith_cluster_silhouette_values, - facecolor=color, - edgecolor=color, - alpha=0.7, - ) + plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) - plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) + y_lower = y_upper + 10 - y_lower = y_upper + 10 + plt.xlabel("The silhouette coefficient values") + plt.ylabel("Cluster label") - plt.xlabel("The silhouette coefficient values") - plt.ylabel("Cluster label") + plt.axvline(x=0, c='black', alpha=0.3) + plt.axvline(x=0.25, c='black', alpha=0.3) + 
plt.axvline(x=0.5, c='black', alpha=0.3) + plt.axvline(x=0.75, c='black', alpha=0.3) - plt.axvline(x=0, c='black', alpha=0.3) - plt.axvline(x=0.25, c='black', alpha=0.3) - plt.axvline(x=0.5, c='black', alpha=0.3) - plt.axvline(x=0.75, c='black', alpha=0.3) + plt.yticks([]) # Clear the yaxis labels / ticks + plt.xticks([-0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1]) - plt.yticks([]) # Clear the yaxis labels / ticks - plt.xticks([-0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1]) - - plt.title( - "Silhouette analysis for " + label + - " clustering with n_clusters = " + str(n_clusters), - ) - plt.tight_layout() - plt.show() + plt.title( + "Silhouette analysis for " + label + + " clustering with n_clusters = " + str(n_clusters), + ) + plt.tight_layout() + plt.savefig('figures/silhouette_plot.png') + plt.show() - return sample_silhouette_values + return sample_silhouette_values def analyze_npi_data( @@ -476,16 +493,18 @@ def analyze_npi_data( plt.figure() plt.title(method) hierarchy.dendrogram(cluster_hierarch) + plt.savefig('figures/dendrogram.png') plt.show() max_coph_dist = coph_dist_mat.max() # TODO: Discuss why abs(npis_corr) is used as input and not corr_pairwdist npi_idx_to_cluster_idx = flatten_hierarch_clustering( abs(npis_corr), cluster_hierarch, [wg * max_coph_dist - for wg in np.linspace(0.01, 1, 100)]) + for wg in np.linspace(0.01, 1, 10)], method) - silhouette(npis_corr, npi_idx_to_cluster_idx.max( - )+1, npi_idx_to_cluster_idx, label=method) + # # TODO: ask why there was used npi_idx_to_cluster_idx.max()+1 before + silhouette(npis_corr, npi_idx_to_cluster_idx.max(), + npi_idx_to_cluster_idx, label=method) cluster_dict = dict() cluster_codes = [[] for i in range(npi_idx_to_cluster_idx.max())] @@ -570,6 +589,7 @@ def analyze_npi_data( (i + 1), len(npis_used))) for i in range( num_images + 1)]: + # TODO: plotList doesn't work customPlot.plotList(df_npis_aggregated.index, [df_npis_aggregated[code] for code in npi_codes_used[i]], @@ -589,7 +609,8 @@ def main(): npis_final = [] directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') file_format = 'json' - npi_codes_considered = ['M01a_010', 'M01a_020', 'M01a_100'] + npi_codes_considered = ['M01a_010', 'M01a_020', + 'M01a_100', 'M01a_110', 'M01a_120'] analyze_npi_data(True, True, fine_resolution, npis_final, directory, file_format, npi_codes_considered) From a6db0347aea948c6bb086e50aec7f91cb3a78b0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BChn?= Date: Wed, 27 Mar 2024 11:51:23 +0100 Subject: [PATCH 15/30] working on silhouette_samples --- .../memilio/epidata/assessNPIEffects.py | 67 +++++++++---------- 1 file changed, 30 insertions(+), 37 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 65db7b23e9..2ca15d92d9 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -48,10 +48,6 @@ def evaluate_clustering(corr_mat, idx_to_cluster_idx, indices_all): @return Scores for the provided clustering. """ - # TODO: Why? 
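As a toy illustration of the silhouette handling this patch works towards (random toy matrix, not the NPI data): silhouette_samples and silhouette_score accept a precomputed distance matrix, are only defined for 2 to n_samples - 1 distinct labels, and the score is the mean of the per-sample values.

import numpy as np
from scipy.spatial import distance
from sklearn.metrics import silhouette_samples, silhouette_score

rng = np.random.default_rng(2)
corr = np.corrcoef(rng.random((6, 30)))               # toy 6x6 'correlation matrix'
dist_mat = distance.squareform(distance.pdist(corr))  # square pairwise distance matrix
labels = np.array([0, 0, 0, 1, 1, 1])                 # 2 clusters for 6 samples

if 1 < len(np.unique(labels)) < len(labels):          # required label-count range
    values = silhouette_samples(dist_mat, labels, metric="precomputed")
    score = silhouette_score(dist_mat, labels, metric="precomputed")
    print(np.isclose(score, values.mean()))           # True
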
- if idx_to_cluster_idx.min() == 1: - idx_to_cluster_idx -= 1 - # store indices of clusters clusters = [[] for i in range(idx_to_cluster_idx.max()+1)] for ii in range(len(idx_to_cluster_idx)): @@ -185,21 +181,17 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights, method): # use the given weight to flatten the dendrogram npi_idx_to_cluster_idx = hierarchy.fcluster( cluster_hierarch, weight, criterion='distance') + + # start clustering labels with zero + if npi_idx_to_cluster_idx.min() == 1: + npi_idx_to_cluster_idx -= 1 + npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) # evaluate clustering - clusters, total_eval_number[n] = evaluate_clustering( - corr_mat, npi_idx_to_cluster_idx, npi_indices_all) - - # append new npi_idx to cluster_idx assignment to list of assignments - # npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) - # n += 1 - # get around 55 clusters - # if npi_idx_to_cluster_idx.max() > 45: - # if npi_idx_to_cluster_idx.max() < 65: - # print(npi_idx_to_cluster_idx.max()) - # return npi_idx_to_cluster_idx - + # clusters, total_eval_number[n] = evaluate_clustering( + # corr_mat, npi_idx_to_cluster_idx, npi_indices_all) + # print scores on clustering # print("Number of clusters: " + str(int(np.nanmin(np.array(total_eval_number))))) @@ -208,22 +200,25 @@ def flatten_hierarch_clustering(corr_mat, cluster_hierarch, weights, method): # return npi_idx_to_cluster_idx with highest mean silhouette coefficient # if we get same value for multiple weights, take smallest weight # TODO: Discuss if this is a good metric - return npi_idx_to_cluster_idx_list[np.where(total_eval_number == np.nanmax(np.array(total_eval_number)))[0][0]] - + return npi_idx_to_cluster_idx_list #[np.where(total_eval_number == np.nanmax(np.array(total_eval_number)))[0][0]] -def silhouette(X, cluster_sizes, cluster_labels, label): - # TODO: check for clusterings where every sample is in their own cluster - # in that case we cannot use silhouette_samples because we need 'Valid values are 2 to n_samples - 1 (inclusive)' +def silhouette(pairw_dist_mat, cluster_labels, method): - if not isinstance(cluster_sizes, list): - cluster_sizes = [cluster_sizes] + # if only one clustering is given, surround it by a list to make loop work. + if isinstance(cluster_labels, list) and ((isinstance(cluster_labels[0], int)) or (isinstance(cluster_labels[0], np.intc))): + cluster_labels = [cluster_labels] plt.figure() - if len(np.unique(cluster_labels)) < X.shape[0]: - for n_clusters in cluster_sizes: - sample_silhouette_values = silhouette_samples(X, cluster_labels) + for clustering in cluster_labels: + + # Check for clusterings where every sample is in their own cluster or where all samples are in one cluster. 
+ # In that case we cannot use silhouette_samples because we need 'Valid values are 2 to n_samples - 1 (inclusive)' + if (len(np.unique(clustering)) < len(clustering)) and (len(clustering) == pairw_dist_mat.shape[0]) and \ + (len(np.unique(clustering)) > 1): + + sample_silhouette_values = silhouette_samples(pairw_dist_mat, clustering, metric="precomputed") y_lower = 10 for i in range(n_clusters): @@ -267,7 +262,10 @@ def silhouette(X, cluster_sizes, cluster_labels, label): plt.savefig('figures/silhouette_plot.png') plt.show() - return sample_silhouette_values + else: + print("Dimension mismatch.") + + return sample_silhouette_values def analyze_npi_data( @@ -446,8 +444,7 @@ def analyze_npi_data( [wg * max_coph_dist for wg in np.linspace(0.01, 1, 500)]) - samples = silhouette(npis_corr, npi_idx_to_cluster_idx.max( - )+1, npi_idx_to_cluster_idx, label=method) + samples = silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, label=method) # ward method = 'ward' @@ -463,8 +460,7 @@ def analyze_npi_data( abs(npis_corr), cluster_hierarch, [wg * max_coph_dist for wg in np.linspace(0.01, 1, 500)]) - samples = silhouette(npis_corr, npi_idx_to_cluster_idx.max( - )+1, npi_idx_to_cluster_idx, label=method) + samples = silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, label=method) # average method = 'average' @@ -481,8 +477,7 @@ def analyze_npi_data( [wg * max_coph_dist for wg in np.linspace(0.01, 1, 500)]) - samples = silhouette(npis_corr, npi_idx_to_cluster_idx.max( - )+1, npi_idx_to_cluster_idx, label=method) + samples = silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, method=method) # centroid has less clusters method = 'centroid' @@ -502,9 +497,7 @@ def analyze_npi_data( [wg * max_coph_dist for wg in np.linspace(0.01, 1, 10)], method) - # # TODO: ask why there was used npi_idx_to_cluster_idx.max()+1 before - silhouette(npis_corr, npi_idx_to_cluster_idx.max(), - npi_idx_to_cluster_idx, label=method) + silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, method=method) cluster_dict = dict() cluster_codes = [[] for i in range(npi_idx_to_cluster_idx.max())] @@ -610,7 +603,7 @@ def main(): directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') file_format = 'json' npi_codes_considered = ['M01a_010', 'M01a_020', - 'M01a_100', 'M01a_110', 'M01a_120'] + 'M01a_100']#, 'M01a_110', 'M01a_120'] analyze_npi_data(True, True, fine_resolution, npis_final, directory, file_format, npi_codes_considered) From ba2e8d2676fde49a4febd31b6d1f461850a7eea6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?K=C3=BChn?= Date: Thu, 28 Mar 2024 16:21:24 +0100 Subject: [PATCH 16/30] evaluate function optmized plus minor changes --- .../memilio/epidata/assessNPIEffects.py | 609 ++++++++++++++++++ 1 file changed, 609 insertions(+) create mode 100644 pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py new file mode 100644 index 0000000000..a67a72c664 --- /dev/null +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -0,0 +1,609 @@ +############################################################################# +# Copyright (C) 2020-2023 German Aerospace Center (DLR-SC) +# +# Authors: Martin J. Kuehn +# +# Contact: Martin J. Kuehn +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################# +import os +import matplotlib.pyplot as plt +import matplotlib.cm as cm +import numpy as np +import pandas as pd +from memilio.epidata import customPlot +from memilio.epidata import defaultDict as dd +from memilio.epidata import getDataIntoPandasDataFrame as gd +from memilio.epidata import getNPIData as gnpi +from scipy.cluster import hierarchy +from scipy.spatial import distance +from sklearn.metrics import silhouette_samples, silhouette_score + +# activate CoW for more predictable behaviour of pandas DataFrames +pd.options.mode.copy_on_write = True + + +def evaluate_clustering(corr_mat, corr_mat_pairw_dist, idx_to_cluster_idx, indices_all): + """! Computes a score for a particular clustering based on the + correlation matrix. The score is computed as the percentage of 'higher' + or 'high' values (e.g., between 0.5 and 0.75 or 0.75 and 1) of the + correlation matrix that are to be found in the diagonal blocks of the + clustered correlation matrix vs these values in the offdiagonal blocks. + + @param corr_mat correlation matrix between the features / data set items + that were clustered. + @param corr_mat_pairw_dist Pairwise distance computed from correlation matrix. + @param idx_to_cluster_idx Mapping of data item to cluster index. + @param indices_all List of indices of all data items. + + @return Scores for the provided clustering. + """ + # store indices of clusters + idx_orig = np.array(range(len(idx_to_cluster_idx))) + clusters = [idx_orig[idx_to_cluster_idx == i] for i in range(idx_to_cluster_idx.max()+1)] + # store remaining/perpendicular indices for all clusters + clusters_perp = [idx_orig[idx_to_cluster_idx != i] for i in range(idx_to_cluster_idx.max()+1)] + + ssc = np.nan + # own score on diagonal and offdiagonal correlation values + # extract correlation values of block diagonals and offdiagonals separately + corr_diag = [] + corr_offdiag = [] + for ii in range(len(clusters)): + corr_diag = np.append(corr_diag, abs( + corr_mat[np.ix_(clusters[ii], clusters[ii])].flatten())) + corr_offdiag = np.append(corr_offdiag, abs( + corr_mat[np.ix_(clusters[ii], clusters_perp[ii])].flatten())) + + step_width=0.02 + nb_steps = int(1/step_width) + corr_thresholds = np.linspace(0+step_width, 1, nb_steps) + # p_offdiag should increase fast to 1 if cluster is good + p_offdiag = np.array([len(np.where(corr_offdiag <= corr_thresholds[i])[0]) + for i in range(len(corr_thresholds))]) / len(corr_offdiag) + # p_diag should be 1 for most time steps and only decrease fast at the end if cluster is good + p_diag = np.array([len(np.where(corr_diag >= corr_thresholds[i])[0]) + for i in range(len(corr_thresholds))]) / len(corr_diag) + + corr_clustering_score = (p_diag.sum() + p_offdiag.sum()) / (2 * nb_steps) + + # silhouette score + # Check for clusterings where every sample is in their own cluster or where all samples are in one cluster. 
+ # In that case we cannot use silhouette_samples because we have to satisfy cluster numbers to be in '2 to n_samples - 1 (inclusive)' + if (len(np.unique(idx_to_cluster_idx)) < len(idx_to_cluster_idx)) and (len(np.unique(idx_to_cluster_idx)) > 1): + + sample_silhouette_values = silhouette_samples(corr_mat_pairw_dist, idx_to_cluster_idx, metric="precomputed") + ssc = silhouette_score(corr_mat_pairw_dist, idx_to_cluster_idx, metric="precomputed") + + # TODO: Check this is really always true! + if ssc != np.sum(sample_silhouette_values)/len(sample_silhouette_values): + print('Error: Average not the same as computed score....') + + + # return clusters and mean silhouette coefficient + return [corr_clustering_score, ssc] + + +def compute_hierarch_clustering(corr_pairwdist, + methods=['single', 'complete', 'average', + 'weighted', 'centroid', 'median', + 'ward']): + """! Computes a hierarchical clustering for a (list of) method(s) and + provides the maximum cophenetic distance(s) as well as a score for the + clustering (see @method evaluate_clustering(...)). + + @param corr_pairwdist Computed pairwise distance between the features / data + set items. + @param method method or list of methods to compute the hierarchical + clustering. + + @return (List of) hierarchical clustering(s), maximum cophenetic distance(s) + and scores of the hierarchical clustering. + """ + + # NOTE: if changing method, pay attention to linkage methods; + # 'centroid', 'median', and 'ward' are correctly defined only if + # Euclidean pairwise metric is used in distance matrix that we used as input. + + # Based on the distances, we compute an hierarchical clustering for + # different methods + max_coph_corr_coeff = -1 + scores = dict() + # allow single entry + if not isinstance(methods, list): + methods = [methods] + # iterate over list + for method in methods: + cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=method) + # compute cophenetic correlation distance/coefficient and cophenetic distance matrix + coph_corr_coeff, coph_dist_mat = hierarchy.cophenet( + cluster_hierarch, corr_pairwdist) + scores[method] = coph_corr_coeff + if coph_corr_coeff > max_coph_corr_coeff: + max_coph_corr_coeff = coph_corr_coeff + max_method = method + max_coph_dist_mat = coph_dist_mat + + # recompute hierarchical clustering with maximal cophenetic correlation distance/coefficient + cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=max_method) + + print( + "Cophenetic correlation distance for method " + max_method + ": " + + str(max_coph_corr_coeff)) + + return cluster_hierarch, max_coph_dist_mat + + +def flatten_hierarch_clustering(corr_mat, corr_mat_pairw_dist, cluster_hierarch, weights, method): + # TODO: Adapt comment on cophenetic distance and weights + """! Flattens a hierarchical clustering for a (list of) maximum cophenetic + distance(s) in the flat clusters and evaluates the resulting clustering with + respect to the corresponding correlation matrix. + + @param corr_mat correlation matrix between the features / data set items + clustered hierarchically. + @param corr_mat_pairw_dist Pairwise distance computed from correlation matrix. + @param cluster_hierarch hierarchical clustering of given features / data + set items. + @param weigths Maximum cophenetic distance or list of maximum cophenetic + distances to compute the flat clustering(s). + + @return flat clustering(s) according to the (list of) maximum distance(s). 
+ """ + + # all indices in npis_corr from 0 to n-1 + npi_indices_all = set(range(corr_mat.shape[0])) + npi_idx_to_cluster_idx_list = [] + # allow single entries + if not isinstance(weights, list): + weights = [weights] + + # set total_eval_number to -1 for each weight because this is the minimal possible silhouette coefficient + total_eval_number = -1*np.ones(len(weights)) + n = 0 + # iterate over weights + for weight in weights: + # use the given weight to flatten the dendrogram + npi_idx_to_cluster_idx = hierarchy.fcluster( + cluster_hierarch, weight, criterion='distance') + + # start clustering labels with zero + if npi_idx_to_cluster_idx.min() == 1: + npi_idx_to_cluster_idx -= 1 + + # evaluate clustering + clusters, total_eval_number[n] = evaluate_clustering( + corr_mat, corr_mat_pairw_dist, npi_idx_to_cluster_idx, npi_indices_all) + + npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) + + # print scores on clustering + print("Number of clusters: " + str(int(np.nanmin(np.array(total_eval_number))))) + + # [total_eval_number.index(np.nanmin(np.array(total_eval_number)))] + + # return npi_idx_to_cluster_idx with highest mean silhouette coefficient + # if we get same value for multiple weights, take smallest weight + # TODO: Discuss if this is a good metric + return npi_idx_to_cluster_idx_list #[np.where(total_eval_number == np.nanmax(np.array(total_eval_number)))[0][0]] + + +def silhouette(pairw_dist_mat, cluster_labels, method): + + # if only one clustering is given, surround it by a list to make loop work. + if isinstance(cluster_labels, list) and ((isinstance(cluster_labels[0], int)) or (isinstance(cluster_labels[0], np.intc))): + cluster_labels = [cluster_labels] + + plt.figure() + + for clustering in cluster_labels: + + # Check for clusterings where every sample is in their own cluster or where all samples are in one cluster. 
+ # In that case we cannot use silhouette_samples because we have to satisfy cluster numbers to be in '2 to n_samples - 1 (inclusive)' + if (len(np.unique(clustering)) < len(clustering)) and (len(clustering) == pairw_dist_mat.shape[0]) and \ + (len(np.unique(clustering)) > 1): + + sample_silhouette_values = silhouette_samples(pairw_dist_mat, clustering, metric="precomputed") + + y_lower = 10 + for i in range(n_clusters): + ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i] + + ith_cluster_silhouette_values.sort() + + size_cluster_i = ith_cluster_silhouette_values.shape[0] + y_upper = y_lower + size_cluster_i + + color = cm.hsv(float(i) / n_clusters) + plt.fill_betweenx( + np.arange(y_lower, y_upper), + 0, + ith_cluster_silhouette_values, + facecolor=color, + edgecolor=color, + alpha=0.7, + ) + + plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) + + y_lower = y_upper + 10 + + plt.xlabel("The silhouette coefficient values") + plt.ylabel("Cluster label") + + plt.axvline(x=0, c='black', alpha=0.3) + plt.axvline(x=0.25, c='black', alpha=0.3) + plt.axvline(x=0.5, c='black', alpha=0.3) + plt.axvline(x=0.75, c='black', alpha=0.3) + + plt.yticks([]) # Clear the yaxis labels / ticks + plt.xticks([-0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1]) + + plt.title( + "Silhouette analysis for " + label + + " clustering with n_clusters = " + str(n_clusters), + ) + plt.tight_layout() + plt.savefig('figures/silhouette_plot.png') + plt.show() + + else: + print("Dimension mismatch.") + + return sample_silhouette_values + + +def analyze_npi_data( + read_data, make_plot, fine_resolution, npis, directory, file_format, + npi_codes_considered): + + if not read_data: + # return df_npis, npis or remove else from below + gnpi.get_npi_data(fine_resolution=2, + file_format=dd.defaultDict['file_format'], + out_folder=dd.defaultDict['out_folder'], + start_date=dd.defaultDict['start_date'], + end_date=dd.defaultDict['end_date'], + make_plot=dd.defaultDict['make_plot'], + ) + + else: # read formatted file + npis = pd.read_excel( + os.path.join( + directory, 'datensatzbeschreibung_massnahmen.xlsx'), + sheet_name=3, engine='openpyxl') + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'germany_counties_npi_subcat_incgrouped' + else: + filename = 'germany_counties_npi_subcat' + else: + filename = 'germany_counties_npi_maincat' + df_npis = pd.read_csv(directory + filename + ".csv") + try: + df_npis = df_npis.drop('Unnamed: 0', axis=1) + except KeyError: + pass + # df_npis = mdfs.extract_subframe_based_on_dates(df_npis, date(2021,1,1), date(2021,6,1)) + # TODO: Check if we need the next line + npis = pd.read_json(os.path.join(directory, 'npis.json')) + # get code levels (main/subcodes) and position of main codes + + # check if any other integer than 0: not implemented or 1: implemented is + # used (maybe to specify the kind of implementation) + + if not npi_codes_considered: + npi_codes_considered = [ + x for x in npis[dd.EngEng['npiCode']] if len(x.split('_')) == 2] + df_merged = df_npis.iloc[:, :2] + for subcode in npi_codes_considered: + # extract columns which have the subcode as part of the column + # name and sum over all these subcodes + df_merged[subcode] = df_npis.filter( + regex=subcode).sum(axis=1) + + # make sure that only considered subcodes are in npis dataframe + npis = npis[npis[dd.EngEng['npiCode']].isin(npi_codes_considered)] + + # use copy on write of df_merged + df_npis = df_merged[:] + + if ~df_npis[npi_codes_considered].isin([0, 1]).any().any(): + + print("Info: Please 
ensure that NPI information is only boolean.") + + else: + # sum over different NPIs and plot share of counties implementing + # these NPIs versus counties without corresponding actions + df_npis_aggregated = df_npis.groupby( + dd.EngEng['date']).agg( + {i: sum for i in npi_codes_considered}).copy() + npis_total_sum = df_npis_aggregated.sum() + + npi_codes_empty = list(np.array(npi_codes_considered)[ + np.where(npis_total_sum == 0)[0]]) + + npi_unused_indices_all = [] + npi_used_indices_all = [] + npi_unused_indices = [] + npi_used_indices = [] + for i in range(len(npi_codes_considered)): + if npi_codes_considered[i] in npi_codes_empty: + npi_unused_indices.append(i) + npi_unused_indices_all.append( + np.where(npis[dd.EngEng['npiCode']] == npi_codes_considered[i])[0][0]) + else: + npi_used_indices.append(i) + npi_used_indices_all.append( + np.where(npis[dd.EngEng['npiCode']] == npi_codes_considered[i])[0][0]) + + npis_unused = np.array(npis['Description'])[ + npi_unused_indices_all] # for all fr1 codes: len=12 + npis_used = np.array(npis['Description'])[ + npi_used_indices_all] # for all fr1 codes: len=155 + npi_codes_used = list(np.array(npi_codes_considered)[npi_used_indices]) + npi_codes_unused = list( + np.array(npi_codes_considered)[npi_unused_indices]) + + # open file to write unused categories + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'unused_subcats_incgrouped.txt' + else: + filename = 'unused_subcats.txt' + else: + filename = 'unused_maincats.txt' + file_npi = open(directory + filename, 'w') + # Writing unused NPIs + for i in range(len(npis_unused)): + file_npi.write(npi_codes_unused[i] + ": " + npis_unused[i]) + file_npi.write("\n") + # Closing file + file_npi.close() + + # open file to write used categories + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'used_subcats_incgrouped.txt' + else: + filename = 'used_subcats.txt' + else: + filename = 'used_maincats.txt' + file_npi = open(directory + filename, 'w') + # Writing used NPIs + for i in range(len(npis_used)): + file_npi.write(npi_codes_used[i] + ": " + npis_used[i]) + file_npi.write("\n") + # Closing file + file_npi.close() + + df_npis_used = df_npis[[dd.EngEng['date'], + dd.EngEng['idCounty']] + npi_codes_used].copy() + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'germany_counties_npi_subcat_used_incgrouped' + else: + filename = 'germany_counties_npi_subcat_used' + else: + filename = 'germany_counties_npi_maincat_used' + gd.write_dataframe(df_npis_used, directory, filename, file_format) + + # compute correlations + npis_corr = df_npis_used.iloc[:, 2:].corr().values + # plot log-colored correlations + plt.imshow(abs(npis_corr), cmap='gray_r') + # plot histogram + plt.figure() + plt.hist(npis_corr.flatten(), bins=50) + plt.title("Correlation histogram", fontsize=18) + plt.xlabel("Correlation", fontsize=12) + plt.ylabel("Number of values", fontsize=12) + + # We understand the rows of npis_corr, the correlations of one NPI + # to the others as one node in the #NPIs-used-dimensional space. + # We compute the pairwise distances of these nodes. 
Then, nodes with + # similar correlations towards all other nodes exhibit small distances + corr_pairwdist = distance.pdist( + npis_corr, metric='euclidean') + + npi_codes_used = np.asarray(df_npis_used.iloc[:, 2:].columns) + + # compute hierarchical clustering (via best-suited method) + compare_methods = False + if compare_methods: + + # centroid + method = 'centroid' + cluster_hierarch, coph_dist_mat = compute_hierarch_clustering( + corr_pairwdist, + method) + # plot dendrogram + plt.figure() + plt.title(method) + hierarchy.dendrogram(cluster_hierarch) + # plt.show() + max_coph_dist = coph_dist_mat.max() + # TODO: Discuss why abs(npis_corr) is used as input and not corr_pairwdist + npi_idx_to_cluster_idx = flatten_hierarch_clustering( + npis_corr, distance.squareform(corr_pairwdist), cluster_hierarch, + [wg * max_coph_dist + for wg in np.linspace(0.01, 1, 500)]) + + samples = silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, label=method) + + # ward + method = 'ward' + cluster_hierarch, coph_dist_mat = compute_hierarch_clustering( + corr_pairwdist, + method) + # plot dendrogram + plt.figure() + plt.title(method) + hierarchy.dendrogram(cluster_hierarch) + max_coph_dist = coph_dist_mat.max() + npi_idx_to_cluster_idx = flatten_hierarch_clustering( + npis_corr, distance.squareform(corr_pairwdist), cluster_hierarch, + [wg * max_coph_dist for wg in np.linspace(0.01, 1, 500)]) + + samples = silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, label=method) + + # average + method = 'average' + cluster_hierarch, coph_dist_mat = compute_hierarch_clustering( + corr_pairwdist, + method) + # plot dendrogram + plt.figure() + plt.title(method) + hierarchy.dendrogram(cluster_hierarch) + max_coph_dist = coph_dist_mat.max() + npi_idx_to_cluster_idx = flatten_hierarch_clustering( + npis_corr, distance.squareform(corr_pairwdist), cluster_hierarch, + [wg * max_coph_dist + for wg in np.linspace(0.01, 1, 500)]) + + samples = silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, method=method) + + # centroid has less clusters + method = 'centroid' + cluster_hierarch, coph_dist_mat = compute_hierarch_clustering( + corr_pairwdist, + method) + # plot dendrogram + plt.figure() + plt.title(method) + hierarchy.dendrogram(cluster_hierarch) + plt.savefig('figures/dendrogram.png') + plt.show() + max_coph_dist = coph_dist_mat.max() + + npi_idx_to_cluster_idx = flatten_hierarch_clustering( + npis_corr, distance.squareform(corr_pairwdist), cluster_hierarch, + [wg * max_coph_dist + for wg in np.linspace(0.98, 0.98, 1)], method) + + silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, method=method) + + cluster_dict = dict() + cluster_codes = [[] for i in range(npi_idx_to_cluster_idx.max())] + cluster_desc = [[] for i in range(npi_idx_to_cluster_idx.max())] + for i in range(len(npi_idx_to_cluster_idx)): + cluster_dict[npi_codes_used[i] + ] = "CM_" + str(npi_idx_to_cluster_idx[i]-1).zfill(3) + cluster_codes[npi_idx_to_cluster_idx + [i]-1].append(npi_codes_used[i]) + cluster_desc[npi_idx_to_cluster_idx + [i]-1].append(str(npis_used[i])) + + # create clustered dataframe + df_npis_clustered = df_npis[[ + dd.EngEng['date'], dd.EngEng['idCounty']]].copy() + + for i in range(len(cluster_codes)): + df_npis_clustered["CM_" + str(i).zfill(3) + ] = df_npis[cluster_codes[i]].max(axis=1).copy() + + npis_corr_cluster = df_npis_clustered.iloc[:, 2:].corr() + # npis_corr_cluster[abs(npis_corr_cluster)<0.25] = 0 + plt.imshow(abs(npis_corr_cluster), cmap='gray_r') + 
plt.title('Absolute correlation>0.25 of clustered NPIs') + plt.xlabel('NPI cluster') + plt.ylabel('NPI cluster') + plt.colorbar() + + # open file to write unused categories + if fine_resolution > 0: + if fine_resolution == 1: + filename = 'clusters_subcats_incgrouped.txt' + else: + filename = 'clusters_subcats.txt' + else: + filename = 'clusters_maincats.txt' + file_npi = open(directory + filename, 'w') + # Writing unused NPIs + for i in range(len(cluster_codes)): + file_npi.write("Cluster " + str(i) + "\n") + for j in range(len(cluster_codes[i])): + file_npi.write(cluster_codes[i][j] + ": " + cluster_desc[i][j]) + file_npi.write("\n") + file_npi.write("\n") + # Closing file + file_npi.close() + + npi_idx_new = np.argsort(npi_idx_to_cluster_idx) + npis_corr_reorder = npis_corr[npi_idx_new, :][:, npi_idx_new] + + plt.imshow(abs(npis_corr_reorder), cmap='gray_r') + plt.colorbar() + + # npi_indices_all = set(range(npis_corr.shape[0])) + # for i in [40]:#[10, 20, 40, 80, 160]: + # kmeans_npis = KMeans(n_clusters=i).fit(df_npis_used.iloc[:,2:].T) + # evaluate_clustering(npis_corr, kmeans_npis.labels_, npi_indices_all) + + # for i in [40]:#[10, 20, 40, 80, 160]: + # kmeans_corr = KMeans(n_clusters=i).fit(npis_corr) + # evaluate_clustering(npis_corr, kmeans_corr.labels_, npi_indices_all) + + # corr_threshold = 0.5 + # corr_indices_threshold = np.where(npis_corr > corr_threshold) + # npis_corr_threshold = np.zeros(npis_corr.shape) + # npis_corr_threshold[corr_indices_threshold] = npis_corr[corr_indices_threshold] + # plt.imshow(npis_corr_threshold, cmap='gray_r') + + # plot share of counties that implement the main categories + if make_plot: + # plot four different subsets of curves for better distinction + j = 0 + if fine_resolution > 0: + num_images = 15 + else: + num_images = 1 + for i in [ + slice( + int(len(npi_codes_used) / num_images) * i, + min( + int(len(npi_codes_used) / num_images) * + (i + 1), + len(npis_used))) for i in range( + num_images + 1)]: + # TODO: plotList doesn't work + customPlot.plotList(df_npis_aggregated.index, + [df_npis_aggregated[code] + for code in npi_codes_used[i]], + npis_used[i], + 'Counties implementing NPI main categories', + 'Date', 'Number', "Counties_NPI_main_" + + str(j) + "_of_"+str(num_images)) + plt.tight_layout() + j += 1 + + +def main(): + """! 
Main program entry.""" + + # arg_dict = gd.cli("testing") + fine_resolution = 2 + npis_final = [] + directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') + file_format = 'json' + npi_codes_considered = ['M01a_010', 'M01a_020', + 'M01a_100', 'M01a_110', 'M01a_120'] + analyze_npi_data(True, True, fine_resolution, npis_final, + directory, file_format, npi_codes_considered) + + +if __name__ == "__main__": + + main() From d3d1d5cfc575d821f020e0886d953d3f3a95c472 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Thu, 4 Apr 2024 10:51:30 +0200 Subject: [PATCH 17/30] flatten to output best clustering --- .../memilio/epidata/assessNPIEffects.py | 27 ++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index a67a72c664..abe5e59446 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -29,10 +29,17 @@ from scipy.cluster import hierarchy from scipy.spatial import distance from sklearn.metrics import silhouette_samples, silhouette_score +from dataclasses import dataclass # activate CoW for more predictable behaviour of pandas DataFrames pd.options.mode.copy_on_write = True +@dataclass +class scored_cluster: + sc1: float # correlation-based cluster + sc2: float # silhouette score + clustering: np.ndarray # index to clustering array + def evaluate_clustering(corr_mat, corr_mat_pairw_dist, idx_to_cluster_idx, indices_all): """! Computes a score for a particular clustering based on the @@ -164,7 +171,7 @@ def flatten_hierarch_clustering(corr_mat, corr_mat_pairw_dist, cluster_hierarch, # all indices in npis_corr from 0 to n-1 npi_indices_all = set(range(corr_mat.shape[0])) - npi_idx_to_cluster_idx_list = [] + scored_clusterings = [] # allow single entries if not isinstance(weights, list): weights = [weights] @@ -183,20 +190,16 @@ def flatten_hierarch_clustering(corr_mat, corr_mat_pairw_dist, cluster_hierarch, npi_idx_to_cluster_idx -= 1 # evaluate clustering - clusters, total_eval_number[n] = evaluate_clustering( + scores = evaluate_clustering( corr_mat, corr_mat_pairw_dist, npi_idx_to_cluster_idx, npi_indices_all) - npi_idx_to_cluster_idx_list.append(npi_idx_to_cluster_idx) - - # print scores on clustering - print("Number of clusters: " + str(int(np.nanmin(np.array(total_eval_number))))) - - # [total_eval_number.index(np.nanmin(np.array(total_eval_number)))] + scored_clusterings.append(scored_cluster(scores[0], scores[1], npi_idx_to_cluster_idx)) - # return npi_idx_to_cluster_idx with highest mean silhouette coefficient - # if we get same value for multiple weights, take smallest weight - # TODO: Discuss if this is a good metric - return npi_idx_to_cluster_idx_list #[np.where(total_eval_number == np.nanmax(np.array(total_eval_number)))[0][0]] + # TODO: get best clustering + pos_best_clustering_1 = np.nanargmax([s.sc1 for s in scored_clusterings]) + pos_best_clustering_2 = np.nanargmax([s.sc2 for s in scored_clusterings]) + pos_best_clustering_12 = np.nanargmax([s.sc1+s.sc2 for s in scored_clusterings]) + return scored_clusterings[pos_best_clustering_1].clustering def silhouette(pairw_dist_mat, cluster_labels, method): From 2ee788e994f95856ff475da3ed342b69e9c10359 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Thu, 4 Apr 2024 11:06:21 +0200 Subject: [PATCH 18/30] updated best clustering computation and 
todo comments --- pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index abe5e59446..9c24910848 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -90,6 +90,8 @@ def evaluate_clustering(corr_mat, corr_mat_pairw_dist, idx_to_cluster_idx, indic # In that case we cannot use silhouette_samples because we have to satisfy cluster numbers to be in '2 to n_samples - 1 (inclusive)' if (len(np.unique(idx_to_cluster_idx)) < len(idx_to_cluster_idx)) and (len(np.unique(idx_to_cluster_idx)) > 1): + # TODO: Do we want to compute the silhouette score on the distance of npis_corr or abs(npis_corr)? + # Otherwise formulated: How do we work with negative correlations for the correlation distance? -1 = 1 ? sample_silhouette_values = silhouette_samples(corr_mat_pairw_dist, idx_to_cluster_idx, metric="precomputed") ssc = silhouette_score(corr_mat_pairw_dist, idx_to_cluster_idx, metric="precomputed") @@ -153,7 +155,6 @@ def compute_hierarch_clustering(corr_pairwdist, def flatten_hierarch_clustering(corr_mat, corr_mat_pairw_dist, cluster_hierarch, weights, method): - # TODO: Adapt comment on cophenetic distance and weights """! Flattens a hierarchical clustering for a (list of) maximum cophenetic distance(s) in the flat clusters and evaluates the resulting clustering with respect to the corresponding correlation matrix. @@ -163,7 +164,7 @@ def flatten_hierarch_clustering(corr_mat, corr_mat_pairw_dist, cluster_hierarch, @param corr_mat_pairw_dist Pairwise distance computed from correlation matrix. @param cluster_hierarch hierarchical clustering of given features / data set items. - @param weigths Maximum cophenetic distance or list of maximum cophenetic + @param weights Cophenetic distance or list of cophenetic distances to compute the flat clustering(s). @return flat clustering(s) according to the (list of) maximum distance(s). 
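(Editorial aside, hedged: the hunk below only changes how the best-scored flat clustering is picked from the candidates collected in the loop. As a self-contained sketch of that flatten-and-score idea on synthetic data, independent of the MEmilio data set, something like the following could be used; every name in it, such as toy_data, frac or best_labels, is made up purely for illustration.)

import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial import distance
from sklearn.metrics import silhouette_score

rng = np.random.default_rng(0)
toy_data = rng.random((6, 100))           # 6 items with 100 observations each
toy_corr = np.corrcoef(toy_data)          # 6 x 6 correlation matrix
pairw_dist = distance.pdist(toy_corr)     # Euclidean distances between rows
linkage_mat = hierarchy.linkage(pairw_dist, method='centroid')
max_coph_dist = hierarchy.cophenet(linkage_mat).max()

best_score, best_labels = -1.0, None
for frac in np.linspace(0.05, 1.0, 100):
    labels = hierarchy.fcluster(linkage_mat, frac * max_coph_dist,
                                criterion='distance')
    # silhouette_score requires between 2 and n-1 clusters
    if 1 < len(np.unique(labels)) < len(labels):
        score = silhouette_score(distance.squareform(pairw_dist), labels,
                                 metric='precomputed')
        if score > best_score:
            best_score, best_labels = score, labels
print(best_score, best_labels)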
@@ -198,7 +199,7 @@ def flatten_hierarch_clustering(corr_mat, corr_mat_pairw_dist, cluster_hierarch, # TODO: get best clustering pos_best_clustering_1 = np.nanargmax([s.sc1 for s in scored_clusterings]) pos_best_clustering_2 = np.nanargmax([s.sc2 for s in scored_clusterings]) - pos_best_clustering_12 = np.nanargmax([s.sc1+s.sc2 for s in scored_clusterings]) + pos_best_clustering_12 = np.nanargmax([s.sc1/scored_clusterings[pos_best_clustering_1].sc1 + s.sc2/scored_clusterings[pos_best_clustering_1].sc2 for s in scored_clusterings]) return scored_clusterings[pos_best_clustering_1].clustering From d4be5840a1cabe52d61e53e176a68150b32f1eab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Thu, 4 Apr 2024 11:30:16 +0200 Subject: [PATCH 19/30] todos added --- .../memilio/epidata/assessNPIEffects.py | 64 +++++++++---------- 1 file changed, 31 insertions(+), 33 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 9c24910848..9a30c8b6f2 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -177,9 +177,6 @@ def flatten_hierarch_clustering(corr_mat, corr_mat_pairw_dist, cluster_hierarch, if not isinstance(weights, list): weights = [weights] - # set total_eval_number to -1 for each weight because this is the minimal possible silhouette coefficient - total_eval_number = -1*np.ones(len(weights)) - n = 0 # iterate over weights for weight in weights: # use the given weight to flatten the dendrogram @@ -426,7 +423,7 @@ def analyze_npi_data( # compute hierarchical clustering (via best-suited method) compare_methods = False if compare_methods: - + # TODO: criterion to select from methods (output scores from flatten_...?!) 
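# Editorial aside (hedged sketch, not part of this patch): one possible
# criterion for the TODO above is the cophenetic correlation coefficient
# between each linkage and the original pairwise distances. On toy data,
# comparing linkage methods that way could look as follows; toy_points and
# method_scores are made-up names used only for illustration.
import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist

toy_points = np.random.default_rng(1).random((8, 3))
toy_dist = pdist(toy_points)  # Euclidean pairwise distances
method_scores = {
    m: hierarchy.cophenet(hierarchy.linkage(toy_dist, method=m), toy_dist)[0]
    for m in ['single', 'complete', 'average', 'ward']}
best_method = max(method_scores, key=method_scores.get)
print(method_scores, best_method)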
# centroid method = 'centroid' cluster_hierarch, coph_dist_mat = compute_hierarch_clustering( @@ -478,37 +475,38 @@ def analyze_npi_data( for wg in np.linspace(0.01, 1, 500)]) samples = silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, method=method) + # end compare_methods - # centroid has less clusters - method = 'centroid' - cluster_hierarch, coph_dist_mat = compute_hierarch_clustering( - corr_pairwdist, - method) - # plot dendrogram - plt.figure() - plt.title(method) - hierarchy.dendrogram(cluster_hierarch) - plt.savefig('figures/dendrogram.png') - plt.show() - max_coph_dist = coph_dist_mat.max() - - npi_idx_to_cluster_idx = flatten_hierarch_clustering( - npis_corr, distance.squareform(corr_pairwdist), cluster_hierarch, - [wg * max_coph_dist - for wg in np.linspace(0.98, 0.98, 1)], method) - - silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, method=method) - - cluster_dict = dict() - cluster_codes = [[] for i in range(npi_idx_to_cluster_idx.max())] - cluster_desc = [[] for i in range(npi_idx_to_cluster_idx.max())] + else: # simplified, centroid had less clusters in example + method = 'centroid' + cluster_hierarch, coph_dist_mat = compute_hierarch_clustering( + corr_pairwdist, + method) + # plot dendrogram + plt.figure() + plt.title(method) + hierarchy.dendrogram(cluster_hierarch) + plt.savefig('figures/dendrogram.png') + plt.show() + max_coph_dist = coph_dist_mat.max() + + npi_idx_to_cluster_idx = flatten_hierarch_clustering( + npis_corr, distance.squareform(corr_pairwdist), cluster_hierarch, + [wg * max_coph_dist + for wg in np.linspace(0.98, 0.98, 1)], method) # TODO reset to 0.01 to 1 with 500 samples + + # silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, method=method) + + cluster_dict = dict() # TODO: Remove because not used? or use to write output? + cluster_codes = [[] for i in range(len(np.unique(npi_idx_to_cluster_idx)))] + cluster_desc = [[] for i in range(len(np.unique(npi_idx_to_cluster_idx)))] for i in range(len(npi_idx_to_cluster_idx)): cluster_dict[npi_codes_used[i] - ] = "CM_" + str(npi_idx_to_cluster_idx[i]-1).zfill(3) + ] = "CM_" + str(npi_idx_to_cluster_idx[i]).zfill(3) # use CM for "Cluster Massnahme" cluster_codes[npi_idx_to_cluster_idx - [i]-1].append(npi_codes_used[i]) + [i]].append(npi_codes_used[i]) cluster_desc[npi_idx_to_cluster_idx - [i]-1].append(str(npis_used[i])) + [i]].append(str(npis_used[i])) # create clustered dataframe df_npis_clustered = df_npis[[ @@ -516,12 +514,12 @@ def analyze_npi_data( for i in range(len(cluster_codes)): df_npis_clustered["CM_" + str(i).zfill(3) - ] = df_npis[cluster_codes[i]].max(axis=1).copy() + ] = df_npis[cluster_codes[i]].max(axis=1).copy() # TODO take weighted average instead of maximum?? - npis_corr_cluster = df_npis_clustered.iloc[:, 2:].corr() + npis_corr_cluster = df_npis_clustered.iloc[:, 2:].corr() # TODO why were 2 and 4 clustered in the example?? 
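# Editorial aside (hedged sketch, not part of this patch): each CM_ column
# above aggregates all NPI columns of one cluster row-wise via max(axis=1),
# and npis_corr_cluster then shows how correlated the resulting cluster
# columns still are. A toy version, with the made-up names df_toy and
# toy_clusters, for illustration:
import pandas as pd

df_toy = pd.DataFrame({'M01a_010': [0, 1, 1, 0], 'M01a_020': [0, 0, 1, 1],
                       'M16_010': [1, 1, 0, 0]})
toy_clusters = [['M01a_010', 'M01a_020'], ['M16_010']]
df_clustered_toy = pd.DataFrame(
    {'CM_' + str(i).zfill(3): df_toy[codes].max(axis=1)
     for i, codes in enumerate(toy_clusters)})
print(df_clustered_toy.corr())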
# npis_corr_cluster[abs(npis_corr_cluster)<0.25] = 0 plt.imshow(abs(npis_corr_cluster), cmap='gray_r') - plt.title('Absolute correlation>0.25 of clustered NPIs') + plt.title('Absolute correlation of clustered NPIs') plt.xlabel('NPI cluster') plt.ylabel('NPI cluster') plt.colorbar() From 254183316092a950212e924e9d8ead86780eff5e Mon Sep 17 00:00:00 2001 From: patricklnz Date: Thu, 25 Apr 2024 12:58:20 +0200 Subject: [PATCH 20/30] Write clustering to file --- .../memilio/epidata/assessNPIEffects.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 9a30c8b6f2..8bcd0fe45e 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -268,6 +268,9 @@ def silhouette(pairw_dist_mat, cluster_labels, method): def analyze_npi_data( read_data, make_plot, fine_resolution, npis, directory, file_format, npi_codes_considered): + + # TODO: Error in custom plot list + make_plot = False if not read_data: # return df_npis, npis or remove else from below @@ -549,6 +552,8 @@ def analyze_npi_data( plt.imshow(abs(npis_corr_reorder), cmap='gray_r') plt.colorbar() + write_clustered_npis(df_npis, 'median', cluster_codes, npis_corr, directory, npi_codes_used) + # npi_indices_all = set(range(npis_corr.shape[0])) # for i in [40]:#[10, 20, 40, 80, 160]: # kmeans_npis = KMeans(n_clusters=i).fit(df_npis_used.iloc[:,2:].T) @@ -591,6 +596,45 @@ def analyze_npi_data( plt.tight_layout() j += 1 +def write_clustered_npis(df_npis: pd.DataFrame, method:str, cluster_codes: list[list[str]], + npis_corr: np.ndarray, directory: str, npi_codes_used:np.ndarray): + cluster_dict = dict() + #only compute cluster with more than two items + cluster_to_combine = [item for item in cluster_codes if len(item)>1] + name_id = 0 + for cl in cluster_to_combine: + # get index of cluster items to check if they are negative or positive correlated + cluster_idx = [np.where(npi_codes_used == cl_code)[0][0] for cl_code in cl] + for id_npi in cluster_idx: + if npis_corr[cluster_idx[0], id_npi] < 0: + # switch npis 0 -> 1 and 1 -> 0 + npi_to_switch = npi_codes_used[id_npi] + df_npis[npi_to_switch]-=1 + df_npis[npi_to_switch]*=-1 + # copy values and delete item from dataframe + values = [df_npis[cl_code].values for cl_code in cl] + df_npis.drop(cl, axis=1, inplace = True) + #name clusters #TODO: how? For now cluster_0,1,2... + cluster_name = 'cluster_' + str(name_id) + # write cluster names in a dict (and to json) to reconstruct clusters after regression + cluster_dict[cluster_name] = cl + # TODO: how should the cluster values for the regression be calculated + if method == 'mean': + cluster_value = np.array(values).mean(axis=0) + if method == 'median': + cluster_value = np.median(np.array(values), axis=0) + df_npis[cluster_name] = cluster_value + name_id+=1 + #write npis with clusters + filename = 'clustered_npis' + file_format = 'json' + gd.write_dataframe(df_npis, directory, filename, file_format) + #write cluster dict + filename = 'cluster_description' + gd.write_dataframe(df_npis, directory, filename, file_format) + + + def main(): """! 
Main program entry.""" From a81b5b135f0515758e9ea0832d24c22281ca0144 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Thu, 25 Apr 2024 14:38:08 +0200 Subject: [PATCH 21/30] write to txt --- pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 8bcd0fe45e..ae9c03144e 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -631,9 +631,8 @@ def write_clustered_npis(df_npis: pd.DataFrame, method:str, cluster_codes: list[ gd.write_dataframe(df_npis, directory, filename, file_format) #write cluster dict filename = 'cluster_description' - gd.write_dataframe(df_npis, directory, filename, file_format) - - + with open(directory+filename, 'w') as f: + print(cluster_dict, file=f) def main(): @@ -647,7 +646,7 @@ def main(): npi_codes_considered = ['M01a_010', 'M01a_020', 'M01a_100', 'M01a_110', 'M01a_120'] analyze_npi_data(True, True, fine_resolution, npis_final, - directory, file_format, npi_codes_considered) + directory, file_format, False) if __name__ == "__main__": From df9b35ab217ce0f451763d94929e2ea90a235a91 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 29 Apr 2024 08:39:14 +0200 Subject: [PATCH 22/30] add optional absolute usage of npis_corr --- .../memilio/epidata/assessNPIEffects.py | 50 ++++++++++++------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index ae9c03144e..a2657feb37 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -20,6 +20,7 @@ import os import matplotlib.pyplot as plt import matplotlib.cm as cm +import matplotlib as mpl import numpy as np import pandas as pd from memilio.epidata import customPlot @@ -34,6 +35,8 @@ # activate CoW for more predictable behaviour of pandas DataFrames pd.options.mode.copy_on_write = True +mpl.use('Qtagg') + @dataclass class scored_cluster: sc1: float # correlation-based cluster @@ -267,7 +270,7 @@ def silhouette(pairw_dist_mat, cluster_labels, method): def analyze_npi_data( read_data, make_plot, fine_resolution, npis, directory, file_format, - npi_codes_considered): + npi_codes_considered, abs_correlation): # TODO: Error in custom plot list make_plot = False @@ -405,11 +408,23 @@ def analyze_npi_data( # compute correlations npis_corr = df_npis_used.iloc[:, 2:].corr().values + npis_corr_abs = abs(npis_corr) + # define which correlation matrix should be used based on input variable + # also adjust colormaps + if abs_correlation: + npis_corr_mat = npis_corr_abs + vmin = 0 + cmap = 'gray_r' + else: + npis_corr_mat = npis_corr + vmin = -1 + cmap = 'RdGy' # plot log-colored correlations - plt.imshow(abs(npis_corr), cmap='gray_r') + plt.imshow(npis_corr_mat, cmap=cmap, vmin=vmin, vmax=1) + plt.colorbar() # plot histogram plt.figure() - plt.hist(npis_corr.flatten(), bins=50) + plt.hist(npis_corr_mat.flatten(), bins=50) plt.title("Correlation histogram", fontsize=18) plt.xlabel("Correlation", fontsize=12) plt.ylabel("Number of values", fontsize=12) @@ -419,7 +434,7 @@ def analyze_npi_data( # We compute the pairwise distances of these nodes. 
Then, nodes with # similar correlations towards all other nodes exhibit small distances corr_pairwdist = distance.pdist( - npis_corr, metric='euclidean') + npis_corr_mat, metric='euclidean') npi_codes_used = np.asarray(df_npis_used.iloc[:, 2:].columns) @@ -438,9 +453,9 @@ def analyze_npi_data( hierarchy.dendrogram(cluster_hierarch) # plt.show() max_coph_dist = coph_dist_mat.max() - # TODO: Discuss why abs(npis_corr) is used as input and not corr_pairwdist + # TODO: Discuss why npis_corr is used as input and not corr_pairwdist npi_idx_to_cluster_idx = flatten_hierarch_clustering( - npis_corr, distance.squareform(corr_pairwdist), cluster_hierarch, + npis_corr_mat, distance.squareform(corr_pairwdist), cluster_hierarch, [wg * max_coph_dist for wg in np.linspace(0.01, 1, 500)]) @@ -457,7 +472,7 @@ def analyze_npi_data( hierarchy.dendrogram(cluster_hierarch) max_coph_dist = coph_dist_mat.max() npi_idx_to_cluster_idx = flatten_hierarch_clustering( - npis_corr, distance.squareform(corr_pairwdist), cluster_hierarch, + npis_corr_mat, distance.squareform(corr_pairwdist), cluster_hierarch, [wg * max_coph_dist for wg in np.linspace(0.01, 1, 500)]) samples = silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, label=method) @@ -473,7 +488,7 @@ def analyze_npi_data( hierarchy.dendrogram(cluster_hierarch) max_coph_dist = coph_dist_mat.max() npi_idx_to_cluster_idx = flatten_hierarch_clustering( - npis_corr, distance.squareform(corr_pairwdist), cluster_hierarch, + npis_corr_mat, distance.squareform(corr_pairwdist), cluster_hierarch, [wg * max_coph_dist for wg in np.linspace(0.01, 1, 500)]) @@ -490,11 +505,11 @@ def analyze_npi_data( plt.title(method) hierarchy.dendrogram(cluster_hierarch) plt.savefig('figures/dendrogram.png') - plt.show() + #plt.show() max_coph_dist = coph_dist_mat.max() npi_idx_to_cluster_idx = flatten_hierarch_clustering( - npis_corr, distance.squareform(corr_pairwdist), cluster_hierarch, + npis_corr_mat, distance.squareform(corr_pairwdist), cluster_hierarch, [wg * max_coph_dist for wg in np.linspace(0.98, 0.98, 1)], method) # TODO reset to 0.01 to 1 with 500 samples @@ -517,11 +532,12 @@ def analyze_npi_data( for i in range(len(cluster_codes)): df_npis_clustered["CM_" + str(i).zfill(3) - ] = df_npis[cluster_codes[i]].max(axis=1).copy() # TODO take weighted average instead of maximum?? + ] = df_npis[cluster_codes[i]].mean(axis=1).copy() # TODO take weighted average instead of maximum?? npis_corr_cluster = df_npis_clustered.iloc[:, 2:].corr() # TODO why were 2 and 4 clustered in the example?? # npis_corr_cluster[abs(npis_corr_cluster)<0.25] = 0 - plt.imshow(abs(npis_corr_cluster), cmap='gray_r') + plt.figure() + plt.imshow(npis_corr_cluster, cmap=cmap, vmin=vmin, vmax=1) plt.title('Absolute correlation of clustered NPIs') plt.xlabel('NPI cluster') plt.ylabel('NPI cluster') @@ -547,9 +563,10 @@ def analyze_npi_data( file_npi.close() npi_idx_new = np.argsort(npi_idx_to_cluster_idx) - npis_corr_reorder = npis_corr[npi_idx_new, :][:, npi_idx_new] + npis_corr_reorder = npis_corr_mat[npi_idx_new, :][:, npi_idx_new] - plt.imshow(abs(npis_corr_reorder), cmap='gray_r') + plt.figure() + plt.imshow(npis_corr_reorder, cmap=cmap, vmin=vmin, vmax=1) plt.colorbar() write_clustered_npis(df_npis, 'median', cluster_codes, npis_corr, directory, npi_codes_used) @@ -638,7 +655,6 @@ def write_clustered_npis(df_npis: pd.DataFrame, method:str, cluster_codes: list[ def main(): """! 
Main program entry.""" - # arg_dict = gd.cli("testing") fine_resolution = 2 npis_final = [] directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') @@ -646,9 +662,9 @@ def main(): npi_codes_considered = ['M01a_010', 'M01a_020', 'M01a_100', 'M01a_110', 'M01a_120'] analyze_npi_data(True, True, fine_resolution, npis_final, - directory, file_format, False) + directory, file_format, False, True) if __name__ == "__main__": - main() + main() \ No newline at end of file From f2d393b0e8bb3824d6b38d1a97676386ea1b9cb9 Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 29 Apr 2024 10:45:31 +0200 Subject: [PATCH 23/30] repair silhouette plot --- .../memilio/epidata/assessNPIEffects.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index a2657feb37..5c898d518e 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -203,15 +203,18 @@ def flatten_hierarch_clustering(corr_mat, corr_mat_pairw_dist, cluster_hierarch, return scored_clusterings[pos_best_clustering_1].clustering -def silhouette(pairw_dist_mat, cluster_labels, method): +def silhouette(pairw_dist_mat, cluster_labels): # if only one clustering is given, surround it by a list to make loop work. - if isinstance(cluster_labels, list) and ((isinstance(cluster_labels[0], int)) or (isinstance(cluster_labels[0], np.intc))): + if (isinstance(cluster_labels, list) or isinstance(cluster_labels, np.ndarray)) and ((isinstance(cluster_labels[0], int)) or (isinstance(cluster_labels[0], np.intc))): cluster_labels = [cluster_labels] plt.figure() for clustering in cluster_labels: + + # get number of clusters (naming starting with 0,1,2...) + n_clusters = clustering.max()+1 # Check for clusterings where every sample is in their own cluster or where all samples are in one cluster. 
# In that case we cannot use silhouette_samples because we have to satisfy cluster numbers to be in '2 to n_samples - 1 (inclusive)' @@ -222,7 +225,7 @@ def silhouette(pairw_dist_mat, cluster_labels, method): y_lower = 10 for i in range(n_clusters): - ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i] + ith_cluster_silhouette_values = sample_silhouette_values[clustering == i] ith_cluster_silhouette_values.sort() @@ -255,12 +258,12 @@ def silhouette(pairw_dist_mat, cluster_labels, method): plt.xticks([-0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1]) plt.title( - "Silhouette analysis for " + label + + "Silhouette analysis for" + " clustering with n_clusters = " + str(n_clusters), ) plt.tight_layout() plt.savefig('figures/silhouette_plot.png') - plt.show() + #plt.show() else: print("Dimension mismatch.") @@ -270,7 +273,7 @@ def silhouette(pairw_dist_mat, cluster_labels, method): def analyze_npi_data( read_data, make_plot, fine_resolution, npis, directory, file_format, - npi_codes_considered, abs_correlation): + npi_codes_considered, abs_correlation, cluster_function): # TODO: Error in custom plot list make_plot = False @@ -511,9 +514,9 @@ def analyze_npi_data( npi_idx_to_cluster_idx = flatten_hierarch_clustering( npis_corr_mat, distance.squareform(corr_pairwdist), cluster_hierarch, [wg * max_coph_dist - for wg in np.linspace(0.98, 0.98, 1)], method) # TODO reset to 0.01 to 1 with 500 samples + for wg in np.linspace(0.2, 1, 500)], method) # TODO reset to 0.01 to 1 with 500 samples - # silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, method=method) + silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx) cluster_dict = dict() # TODO: Remove because not used? or use to write output? cluster_codes = [[] for i in range(len(np.unique(npi_idx_to_cluster_idx)))] @@ -532,7 +535,7 @@ def analyze_npi_data( for i in range(len(cluster_codes)): df_npis_clustered["CM_" + str(i).zfill(3) - ] = df_npis[cluster_codes[i]].mean(axis=1).copy() # TODO take weighted average instead of maximum?? + ] = cluster_function(df_npis[cluster_codes[i]], axis=1) npis_corr_cluster = df_npis_clustered.iloc[:, 2:].corr() # TODO why were 2 and 4 clustered in the example?? 
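# Editorial aside (hedged sketch, not part of this patch): with the new
# cluster_function parameter, any reducer that accepts an axis argument
# (e.g. np.mean, np.median, np.max) can be plugged into the aggregation
# above. df_toy is a made-up frame used only for illustration.
import numpy as np
import pandas as pd

df_toy = pd.DataFrame({'a': [0, 1, 1], 'b': [0, 0, 1], 'c': [1, 1, 1]})
for reducer in (np.mean, np.median, np.max):
    print(reducer.__name__, list(reducer(df_toy[['a', 'b']], axis=1)))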
# npis_corr_cluster[abs(npis_corr_cluster)<0.25] = 0 @@ -567,9 +570,10 @@ def analyze_npi_data( plt.figure() plt.imshow(npis_corr_reorder, cmap=cmap, vmin=vmin, vmax=1) + plt.title('Correlation of reordered NPIs') plt.colorbar() - write_clustered_npis(df_npis, 'median', cluster_codes, npis_corr, directory, npi_codes_used) + write_clustered_npis(df_npis, cluster_function, cluster_codes, npis_corr_mat, directory, npi_codes_used) # npi_indices_all = set(range(npis_corr.shape[0])) # for i in [40]:#[10, 20, 40, 80, 160]: @@ -613,8 +617,7 @@ def analyze_npi_data( plt.tight_layout() j += 1 -def write_clustered_npis(df_npis: pd.DataFrame, method:str, cluster_codes: list[list[str]], - npis_corr: np.ndarray, directory: str, npi_codes_used:np.ndarray): +def write_clustered_npis(df_npis, cluster_function, cluster_codes, npis_corr, directory, npi_codes_used): cluster_dict = dict() #only compute cluster with more than two items cluster_to_combine = [item for item in cluster_codes if len(item)>1] @@ -636,10 +639,7 @@ def write_clustered_npis(df_npis: pd.DataFrame, method:str, cluster_codes: list[ # write cluster names in a dict (and to json) to reconstruct clusters after regression cluster_dict[cluster_name] = cl # TODO: how should the cluster values for the regression be calculated - if method == 'mean': - cluster_value = np.array(values).mean(axis=0) - if method == 'median': - cluster_value = np.median(np.array(values), axis=0) + cluster_value = cluster_function(np.array(values), axis=0) df_npis[cluster_name] = cluster_value name_id+=1 #write npis with clusters @@ -662,7 +662,7 @@ def main(): npi_codes_considered = ['M01a_010', 'M01a_020', 'M01a_100', 'M01a_110', 'M01a_120'] analyze_npi_data(True, True, fine_resolution, npis_final, - directory, file_format, False, True) + directory, file_format, False, True, np.mean) if __name__ == "__main__": From e5a744a79eff7d07e33e84c0ddd320f6a8772851 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Joachim=20K=C3=BChn?= Date: Thu, 16 May 2024 10:19:50 +0200 Subject: [PATCH 24/30] start clustering rki --- pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt diff --git a/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt new file mode 100644 index 0000000000..87b9041656 --- /dev/null +++ b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt @@ -0,0 +1,2 @@ +Contacts in private spaces: [['M01a_020'], ['M01a_080', 'M01a_090','M01a_100'], ['M01a_110','M01a_120'], ['M01a_130','M01a_140','M01a_150']] +Contacts in public spaces: [['M01b_020'], ['M01b_080', 'M01b_090','M01b_100'], ['M01b_110','M01b_120'], ['M01b_130','M01b_140','M01b_150']] From f1e72309f72ae2fc6bddd5ebf7ac7294dfcf6ba4 Mon Sep 17 00:00:00 2001 From: patricklnz <93600225+patricklnz@users.noreply.github.com> Date: Thu, 16 May 2024 14:02:41 +0200 Subject: [PATCH 25/30] Update npi_clustering_rki.txt --- .../memilio/epidata/npi_clustering_rki.txt | 172 +++++++++++++++++- 1 file changed, 170 insertions(+), 2 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt index 87b9041656..b032abc5c9 100644 --- a/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt +++ b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt @@ -1,2 +1,170 @@ -Contacts in private spaces: [['M01a_020'], ['M01a_080', 
'M01a_090','M01a_100'], ['M01a_110','M01a_120'], ['M01a_130','M01a_140','M01a_150']] -Contacts in public spaces: [['M01b_020'], ['M01b_080', 'M01b_090','M01b_100'], ['M01b_110','M01b_120'], ['M01b_130','M01b_140','M01b_150']] +Contacts in private spaces: [ + [], + ['M01a_020'], + ['M01a_080', 'M01a_090', 'M01a_100'], + ['M01a_110', 'M01a_120'], + ['M01a_130', 'M01a_140', 'M01a_150'], + [], +] + +Contacts in public spaces: [ + [], + ['M01b_020'], + ['M01b_080', 'M01b_090', 'M01b_100'], + ['M01b_110', 'M01b_120'], + ['M01b_130', 'M01b_140', 'M01b_150'], + [], +] + +Further Education Schools: [ + [], + ['M02a_010'], + ['M02a_020'], + ['M02a_030', 'M02a_031', 'M02a_032', 'M02a_033', 'M02a_034', 'M02a_035', 'M02a_040'], + [], + [], +] + +Primary Schools: [ + [], + ['M02b_010'], + ['M02b_020'], + ['M02b_030', 'M02b_031', 'M02b_032', 'M02b_033', 'M02b_034', 'M02b_036', 'M02b_035', 'M02b_040'], + [], + [], +] + +Cultural and Educational Institutions: [ + [], + ['M06_020'], + ['M06_030', 'M06_031', 'M06_032', 'M06_033', 'M06_034'], + ['M06_040'], + [], + [], +] + +Retail: [ + [], + ['M07_020'], + ['M07_030', 'M07_031', 'M07_032', 'M07_033', 'M07_034', 'M07_036'], + ['M07_040'], + [], + [], +] + +Hospitality: [ + [], + ['M08_020'], + ['M08_030', 'M08_031', 'M08_032', 'M08_033'], + ['M08_040'], + [], + [], +] + +Services and Crafts: [ + [], + ['M09_020'], + ['M09_030', 'M09_031', 'M09_032', 'M09_033'], + ['M09_040'], + [], + [], +] + +Nightlife Establishments: [ + [], + ['M10_020'], + ['M10_030', 'M10_031', 'M10_032', 'M10_033'], + ['M10_040'], + [], + [], +] + +Accommodation: [ + [], + ['M11_020'], + ['M11_030'], + ['M11_040'], + [], + [], +] + +Indoor Sports: [ + [], + ['M12_020'], + ['M12_030'], + ['M12_040'], + ['M12_070'], + [], +] + +Outdoor Sports: [ + [], + ['M13_020'], + ['M13_030'], + ['M13_040'], + ['M13_070'], + [], +] + +Domestic Travel: [ + [], + ['M14_010'], + ['M14_020'], + ['M14_030'], + [], + [], +] + +International Travel: [ + [], + ['M15_010'], + ['M15_020'], + ['M15_030'], + [], + [], +] + +Masking: [ + ['M16_010', 'M16_020'], + ['M16_030', 'M16_040', 'M16_050', 'M16_060'], + ['M16_070', 'M16_080', 'M16_090'], + [], + [], + ['M16_100'], +] + +Workplace: [ + [], + ['M17_010'], + ['M17_020'], + ['M17_030'], + [], + [], +] + +Curfew: [ + [], + ['M18_020'], + ['M18_030'], + ['M18_040'], + [], + [], +] + +Physical Distancing: [ + ['M20_010'], + ['M20_020'], + [], + [], + [], + [], +] + +Testing: [ + ['M21_010'], + ['M21_020'], + ['M21_030'], + ['M21_040'], + ['M21_050'], + [], +] From 472bed03d7ea4e45d5b51d9cd1366ff9edfb8e2a Mon Sep 17 00:00:00 2001 From: Anna Wendler <106674756+annawendler@users.noreply.github.com> Date: Wed, 22 May 2024 12:12:33 +0200 Subject: [PATCH 26/30] write rki clustering --- .../memilio/epidata/assessNPIEffects.py | 183 +++++++--- .../epidata/npi_clustering_rki_manual.json | 327 ++++++++++++++++++ 2 files changed, 457 insertions(+), 53 deletions(-) create mode 100644 pycode/memilio-epidata/memilio/epidata/npi_clustering_rki_manual.json diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 5c898d518e..63a6185a1c 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -18,6 +18,7 @@ # limitations under the License. 
############################################################################# import os +import json import matplotlib.pyplot as plt import matplotlib.cm as cm import matplotlib as mpl @@ -37,11 +38,12 @@ mpl.use('Qtagg') + @dataclass class scored_cluster: - sc1: float # correlation-based cluster - sc2: float # silhouette score - clustering: np.ndarray # index to clustering array + sc1: float # correlation-based cluster + sc2: float # silhouette score + clustering: np.ndarray # index to clustering array def evaluate_clustering(corr_mat, corr_mat_pairw_dist, idx_to_cluster_idx, indices_all): @@ -61,9 +63,11 @@ def evaluate_clustering(corr_mat, corr_mat_pairw_dist, idx_to_cluster_idx, indic """ # store indices of clusters idx_orig = np.array(range(len(idx_to_cluster_idx))) - clusters = [idx_orig[idx_to_cluster_idx == i] for i in range(idx_to_cluster_idx.max()+1)] + clusters = [idx_orig[idx_to_cluster_idx == i] + for i in range(idx_to_cluster_idx.max()+1)] # store remaining/perpendicular indices for all clusters - clusters_perp = [idx_orig[idx_to_cluster_idx != i] for i in range(idx_to_cluster_idx.max()+1)] + clusters_perp = [idx_orig[idx_to_cluster_idx != i] + for i in range(idx_to_cluster_idx.max()+1)] ssc = np.nan # own score on diagonal and offdiagonal correlation values @@ -76,16 +80,16 @@ def evaluate_clustering(corr_mat, corr_mat_pairw_dist, idx_to_cluster_idx, indic corr_offdiag = np.append(corr_offdiag, abs( corr_mat[np.ix_(clusters[ii], clusters_perp[ii])].flatten())) - step_width=0.02 + step_width = 0.02 nb_steps = int(1/step_width) corr_thresholds = np.linspace(0+step_width, 1, nb_steps) # p_offdiag should increase fast to 1 if cluster is good p_offdiag = np.array([len(np.where(corr_offdiag <= corr_thresholds[i])[0]) - for i in range(len(corr_thresholds))]) / len(corr_offdiag) + for i in range(len(corr_thresholds))]) / len(corr_offdiag) # p_diag should be 1 for most time steps and only decrease fast at the end if cluster is good p_diag = np.array([len(np.where(corr_diag >= corr_thresholds[i])[0]) - for i in range(len(corr_thresholds))]) / len(corr_diag) - + for i in range(len(corr_thresholds))]) / len(corr_diag) + corr_clustering_score = (p_diag.sum() + p_offdiag.sum()) / (2 * nb_steps) # silhouette score @@ -95,14 +99,15 @@ def evaluate_clustering(corr_mat, corr_mat_pairw_dist, idx_to_cluster_idx, indic # TODO: Do we want to compute the silhouette score on the distance of npis_corr or abs(npis_corr)? # Otherwise formulated: How do we work with negative correlations for the correlation distance? -1 = 1 ? - sample_silhouette_values = silhouette_samples(corr_mat_pairw_dist, idx_to_cluster_idx, metric="precomputed") - ssc = silhouette_score(corr_mat_pairw_dist, idx_to_cluster_idx, metric="precomputed") + sample_silhouette_values = silhouette_samples( + corr_mat_pairw_dist, idx_to_cluster_idx, metric="precomputed") + ssc = silhouette_score(corr_mat_pairw_dist, + idx_to_cluster_idx, metric="precomputed") # TODO: Check this is really always true! 
if ssc != np.sum(sample_silhouette_values)/len(sample_silhouette_values): print('Error: Average not the same as computed score....') - # return clusters and mean silhouette coefficient return [corr_clustering_score, ssc] @@ -185,7 +190,7 @@ def flatten_hierarch_clustering(corr_mat, corr_mat_pairw_dist, cluster_hierarch, # use the given weight to flatten the dendrogram npi_idx_to_cluster_idx = hierarchy.fcluster( cluster_hierarch, weight, criterion='distance') - + # start clustering labels with zero if npi_idx_to_cluster_idx.min() == 1: npi_idx_to_cluster_idx -= 1 @@ -193,13 +198,15 @@ def flatten_hierarch_clustering(corr_mat, corr_mat_pairw_dist, cluster_hierarch, # evaluate clustering scores = evaluate_clustering( corr_mat, corr_mat_pairw_dist, npi_idx_to_cluster_idx, npi_indices_all) - - scored_clusterings.append(scored_cluster(scores[0], scores[1], npi_idx_to_cluster_idx)) + + scored_clusterings.append(scored_cluster( + scores[0], scores[1], npi_idx_to_cluster_idx)) # TODO: get best clustering pos_best_clustering_1 = np.nanargmax([s.sc1 for s in scored_clusterings]) pos_best_clustering_2 = np.nanargmax([s.sc2 for s in scored_clusterings]) - pos_best_clustering_12 = np.nanargmax([s.sc1/scored_clusterings[pos_best_clustering_1].sc1 + s.sc2/scored_clusterings[pos_best_clustering_1].sc2 for s in scored_clusterings]) + pos_best_clustering_12 = np.nanargmax([s.sc1/scored_clusterings[pos_best_clustering_1].sc1 + + s.sc2/scored_clusterings[pos_best_clustering_1].sc2 for s in scored_clusterings]) return scored_clusterings[pos_best_clustering_1].clustering @@ -212,16 +219,17 @@ def silhouette(pairw_dist_mat, cluster_labels): plt.figure() for clustering in cluster_labels: - + # get number of clusters (naming starting with 0,1,2...) n_clusters = clustering.max()+1 # Check for clusterings where every sample is in their own cluster or where all samples are in one cluster. 
# In that case we cannot use silhouette_samples because we have to satisfy cluster numbers to be in '2 to n_samples - 1 (inclusive)' if (len(np.unique(clustering)) < len(clustering)) and (len(clustering) == pairw_dist_mat.shape[0]) and \ - (len(np.unique(clustering)) > 1): + (len(np.unique(clustering)) > 1): - sample_silhouette_values = silhouette_samples(pairw_dist_mat, clustering, metric="precomputed") + sample_silhouette_values = silhouette_samples( + pairw_dist_mat, clustering, metric="precomputed") y_lower = 10 for i in range(n_clusters): @@ -263,18 +271,18 @@ def silhouette(pairw_dist_mat, cluster_labels): ) plt.tight_layout() plt.savefig('figures/silhouette_plot.png') - #plt.show() + # plt.show() else: - print("Dimension mismatch.") + print("Dimension mismatch.") return sample_silhouette_values def analyze_npi_data( read_data, make_plot, fine_resolution, npis, directory, file_format, - npi_codes_considered, abs_correlation, cluster_function): - + npi_codes_considered, abs_correlation, cluster_function, manual_rki_clustering=False): + # TODO: Error in custom plot list make_plot = False @@ -441,6 +449,11 @@ def analyze_npi_data( npi_codes_used = np.asarray(df_npis_used.iloc[:, 2:].columns) + if manual_rki_clustering: + write_cluster_rki(df_npis, cluster_function, + npis_corr, directory, npi_codes_used) + return + # compute hierarchical clustering (via best-suited method) compare_methods = False if compare_methods: @@ -458,11 +471,13 @@ def analyze_npi_data( max_coph_dist = coph_dist_mat.max() # TODO: Discuss why npis_corr is used as input and not corr_pairwdist npi_idx_to_cluster_idx = flatten_hierarch_clustering( - npis_corr_mat, distance.squareform(corr_pairwdist), cluster_hierarch, + npis_corr_mat, distance.squareform( + corr_pairwdist), cluster_hierarch, [wg * max_coph_dist for wg in np.linspace(0.01, 1, 500)]) - samples = silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, label=method) + samples = silhouette(distance.squareform( + corr_pairwdist), npi_idx_to_cluster_idx, label=method) # ward method = 'ward' @@ -475,10 +490,12 @@ def analyze_npi_data( hierarchy.dendrogram(cluster_hierarch) max_coph_dist = coph_dist_mat.max() npi_idx_to_cluster_idx = flatten_hierarch_clustering( - npis_corr_mat, distance.squareform(corr_pairwdist), cluster_hierarch, + npis_corr_mat, distance.squareform( + corr_pairwdist), cluster_hierarch, [wg * max_coph_dist for wg in np.linspace(0.01, 1, 500)]) - samples = silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, label=method) + samples = silhouette(distance.squareform( + corr_pairwdist), npi_idx_to_cluster_idx, label=method) # average method = 'average' @@ -491,14 +508,16 @@ def analyze_npi_data( hierarchy.dendrogram(cluster_hierarch) max_coph_dist = coph_dist_mat.max() npi_idx_to_cluster_idx = flatten_hierarch_clustering( - npis_corr_mat, distance.squareform(corr_pairwdist), cluster_hierarch, + npis_corr_mat, distance.squareform( + corr_pairwdist), cluster_hierarch, [wg * max_coph_dist for wg in np.linspace(0.01, 1, 500)]) - samples = silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx, method=method) + samples = silhouette(distance.squareform( + corr_pairwdist), npi_idx_to_cluster_idx, method=method) # end compare_methods - else: # simplified, centroid had less clusters in example + else: # simplified, centroid had less clusters in example method = 'centroid' cluster_hierarch, coph_dist_mat = compute_hierarch_clustering( corr_pairwdist, @@ -508,22 +527,26 @@ def analyze_npi_data( 
plt.title(method) hierarchy.dendrogram(cluster_hierarch) plt.savefig('figures/dendrogram.png') - #plt.show() + # plt.show() max_coph_dist = coph_dist_mat.max() npi_idx_to_cluster_idx = flatten_hierarch_clustering( - npis_corr_mat, distance.squareform(corr_pairwdist), cluster_hierarch, + npis_corr_mat, distance.squareform( + corr_pairwdist), cluster_hierarch, [wg * max_coph_dist - for wg in np.linspace(0.2, 1, 500)], method) # TODO reset to 0.01 to 1 with 500 samples + for wg in np.linspace(0.2, 1, 500)], method) # TODO reset to 0.01 to 1 with 500 samples - silhouette(distance.squareform(corr_pairwdist), npi_idx_to_cluster_idx) + silhouette(distance.squareform( + corr_pairwdist), npi_idx_to_cluster_idx) - cluster_dict = dict() # TODO: Remove because not used? or use to write output? - cluster_codes = [[] for i in range(len(np.unique(npi_idx_to_cluster_idx)))] - cluster_desc = [[] for i in range(len(np.unique(npi_idx_to_cluster_idx)))] + cluster_dict = dict() # TODO: Remove because not used? or use to write output? + cluster_codes = [[] + for i in range(len(np.unique(npi_idx_to_cluster_idx)))] + cluster_desc = [[] + for i in range(len(np.unique(npi_idx_to_cluster_idx)))] for i in range(len(npi_idx_to_cluster_idx)): cluster_dict[npi_codes_used[i] - ] = "CM_" + str(npi_idx_to_cluster_idx[i]).zfill(3) # use CM for "Cluster Massnahme" + ] = "CM_" + str(npi_idx_to_cluster_idx[i]).zfill(3) # use CM for "Cluster Massnahme" cluster_codes[npi_idx_to_cluster_idx [i]].append(npi_codes_used[i]) cluster_desc[npi_idx_to_cluster_idx @@ -537,7 +560,8 @@ def analyze_npi_data( df_npis_clustered["CM_" + str(i).zfill(3) ] = cluster_function(df_npis[cluster_codes[i]], axis=1) - npis_corr_cluster = df_npis_clustered.iloc[:, 2:].corr() # TODO why were 2 and 4 clustered in the example?? + # TODO why were 2 and 4 clustered in the example?? 
+ npis_corr_cluster = df_npis_clustered.iloc[:, 2:].corr() # npis_corr_cluster[abs(npis_corr_cluster)<0.25] = 0 plt.figure() plt.imshow(npis_corr_cluster, cmap=cmap, vmin=vmin, vmax=1) @@ -573,7 +597,8 @@ def analyze_npi_data( plt.title('Correlation of reordered NPIs') plt.colorbar() - write_clustered_npis(df_npis, cluster_function, cluster_codes, npis_corr_mat, directory, npi_codes_used) + write_clustered_npis(df_npis, cluster_function, cluster_codes, + npis_corr_mat, directory, npi_codes_used) # npi_indices_all = set(range(npis_corr.shape[0])) # for i in [40]:#[10, 20, 40, 80, 160]: @@ -617,41 +642,92 @@ def analyze_npi_data( plt.tight_layout() j += 1 + def write_clustered_npis(df_npis, cluster_function, cluster_codes, npis_corr, directory, npi_codes_used): cluster_dict = dict() - #only compute cluster with more than two items - cluster_to_combine = [item for item in cluster_codes if len(item)>1] + # only compute cluster with more than two items + cluster_to_combine = [item for item in cluster_codes if len(item) > 1] name_id = 0 for cl in cluster_to_combine: # get index of cluster items to check if they are negative or positive correlated - cluster_idx = [np.where(npi_codes_used == cl_code)[0][0] for cl_code in cl] + cluster_idx = [np.where(npi_codes_used == cl_code)[0][0] + for cl_code in cl] for id_npi in cluster_idx: if npis_corr[cluster_idx[0], id_npi] < 0: # switch npis 0 -> 1 and 1 -> 0 npi_to_switch = npi_codes_used[id_npi] - df_npis[npi_to_switch]-=1 - df_npis[npi_to_switch]*=-1 + df_npis[npi_to_switch] -= 1 + df_npis[npi_to_switch] *= -1 # copy values and delete item from dataframe values = [df_npis[cl_code].values for cl_code in cl] - df_npis.drop(cl, axis=1, inplace = True) - #name clusters #TODO: how? For now cluster_0,1,2... + df_npis.drop(cl, axis=1, inplace=True) + # name clusters #TODO: how? For now cluster_0,1,2... 
cluster_name = 'cluster_' + str(name_id) # write cluster names in a dict (and to json) to reconstruct clusters after regression cluster_dict[cluster_name] = cl # TODO: how should the cluster values for the regression be calculated cluster_value = cluster_function(np.array(values), axis=0) df_npis[cluster_name] = cluster_value - name_id+=1 - #write npis with clusters + name_id += 1 + # write npis with clusters filename = 'clustered_npis' file_format = 'json' gd.write_dataframe(df_npis, directory, filename, file_format) - #write cluster dict + # write cluster dict filename = 'cluster_description' with open(directory+filename, 'w') as f: print(cluster_dict, file=f) +def write_cluster_rki(df_npis, cluster_function, npis_corr, directory, npi_codes_used): + cluster_dict = dict() + # only compute cluster with more than two items + # Read the JSON file + with open(os.path.join(directory, 'npi_clustering_rki_manual.json')) as json_file: + data = json.load(json_file) + + # Convert JSON data to a DataFrame + df_rki_clustering = pd.DataFrame.from_dict( + data, orient='index').transpose() + + cluster_codes = [list(item) for sublist in df_rki_clustering.values.tolist() + for item in sublist] + # remove empty lists from cluster_codes + cluster_codes = [x for x in cluster_codes if x] + + cluster_to_combine = [item for item in cluster_codes if len(item) > 1] + name_id = 0 + for cl in cluster_to_combine: + # get index of cluster items to check if they are negative or positive correlated if they have been used + cluster_idx = [np.where(npi_codes_used == cl_code) + for cl_code in cl if np.where(npi_codes_used == cl_code)[0].size > 0] + for id_npi in cluster_idx: + if npis_corr[cluster_idx[0], id_npi] < 0: + # switch npis 0 -> 1 and 1 -> 0 + npi_to_switch = npi_codes_used[id_npi] + df_npis[npi_to_switch] -= 1 + df_npis[npi_to_switch] *= -1 + # copy values and delete item from dataframe + values = [df_npis[cl_code].values for cl_code in cl] + df_npis.drop(cl, axis=1, inplace=True) + # name clusters #TODO: how? For now cluster_0,1,2... + cluster_name = 'cluster_' + str(name_id) + # write cluster names in a dict (and to json) to reconstruct clusters after regression + cluster_dict[cluster_name] = cl + # TODO: how should the cluster values for the regression be calculated + cluster_value = cluster_function(np.array(values), axis=0) + df_npis[cluster_name] = cluster_value + name_id += 1 + # write npis with clusters + filename = 'clustered_npis_rki' + file_format = 'json' + gd.write_dataframe(df_npis, directory, filename, file_format) + # write cluster dict + filename = 'cluster_description_rki' + with open(directory+filename, 'w') as f: + print(cluster_dict, file=f) + + def main(): """! 
Main program entry.""" @@ -661,10 +737,11 @@ def main(): file_format = 'json' npi_codes_considered = ['M01a_010', 'M01a_020', 'M01a_100', 'M01a_110', 'M01a_120'] - analyze_npi_data(True, True, fine_resolution, npis_final, - directory, file_format, False, True, np.mean) + + analyze_npi_data(read_data=True, make_plot=True, fine_resolution=fine_resolution, npis=npis_final, + directory=directory, file_format=file_format, npi_codes_considered=False, abs_correlation=True, cluster_function=np.mean, manual_rki_clustering=True) if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki_manual.json b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki_manual.json new file mode 100644 index 0000000000..f9c299e8fb --- /dev/null +++ b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki_manual.json @@ -0,0 +1,327 @@ +{ + "Contacts in private spaces": [ + [], + [ + "M01a_020" + ], + [ + "M01a_080", + "M01a_090", + "M01a_100" + ], + [ + "M01a_110", + "M01a_120" + ], + [ + "M01a_130", + "M01a_140", + "M01a_150" + ], + [] + ], + "Contacts in public spaces": [ + [], + [ + "M01b_020" + ], + [ + "M01b_080", + "M01b_090", + "M01b_100" + ], + [ + "M01b_110", + "M01b_120" + ], + [ + "M01b_130", + "M01b_140", + "M01b_150" + ], + [] + ], + "Further Education Schools": [ + [], + [ + "M02a_010" + ], + [ + "M02a_020" + ], + [ + "M02a_030", + "M02a_031", + "M02a_032", + "M02a_033", + "M02a_034", + "M02a_035", + "M02a_040" + ], + [], + [] + ], + "Primary Schools": [ + [], + [ + "M02b_010" + ], + [ + "M02b_020" + ], + [ + "M02b_030", + "M02b_031", + "M02b_032", + "M02b_033", + "M02b_034", + "M02b_036", + "M02b_035", + "M02b_040" + ], + [], + [] + ], + "Cultural and Educational Institutions": [ + [], + [ + "M06_020" + ], + [ + "M06_030", + "M06_031", + "M06_032", + "M06_033", + "M06_034" + ], + [ + "M06_040" + ], + [], + [] + ], + "Retail": [ + [], + [ + "M07_020" + ], + [ + "M07_030", + "M07_031", + "M07_032", + "M07_033", + "M07_034", + "M07_036" + ], + [ + "M07_040" + ], + [], + [] + ], + "Hospitality": [ + [], + [ + "M08_020" + ], + [ + "M08_030", + "M08_031", + "M08_032", + "M08_033" + ], + [ + "M08_040" + ], + [], + [] + ], + "Services and Crafts": [ + [], + [ + "M09_020" + ], + [ + "M09_030", + "M09_031", + "M09_032", + "M09_033" + ], + [ + "M09_040" + ], + [], + [] + ], + "Nightlife Establishments": [ + [], + [ + "M10_020" + ], + [ + "M10_030", + "M10_031", + "M10_032", + "M10_033" + ], + [ + "M10_040" + ], + [], + [] + ], + "Accommodation": [ + [], + [ + "M11_020" + ], + [ + "M11_030" + ], + [ + "M11_040" + ], + [], + [] + ], + "Indoor Sports": [ + [], + [ + "M12_020" + ], + [ + "M12_030" + ], + [ + "M12_040" + ], + [ + "M12_070" + ], + [] + ], + "Outdoor Sports": [ + [], + [ + "M13_020" + ], + [ + "M13_030" + ], + [ + "M13_040" + ], + [ + "M13_070" + ], + [] + ], + "Domestic Travel": [ + [], + [ + "M14_010" + ], + [ + "M14_020" + ], + [ + "M14_030" + ], + [], + [] + ], + "International Travel": [ + [], + [ + "M15_010" + ], + [ + "M15_020" + ], + [ + "M15_030" + ], + [], + [] + ], + "Masking": [ + [ + "M16_010", + "M16_020" + ], + [ + "M16_030", + "M16_040", + "M16_050", + "M16_060" + ], + [ + "M16_070", + "M16_080", + "M16_090" + ], + [], + [], + [ + "M16_100" + ] + ], + "Workplace": [ + [], + [ + "M17_010" + ], + [ + "M17_020" + ], + [ + "M17_030" + ], + [], + [] + ], + "Curfew": [ + [], + [ + "M18_020" + ], + [ + "M18_030" + ], + [ + "M18_040" + ], + [], + [] + ], + "Physical Distancing": [ + [ + "M20_010" + ], + [ 
+ "M20_020" + ], + [], + [], + [], + [] + ], + "Testing": [ + [ + "M21_010" + ], + [ + "M21_020" + ], + [ + "M21_030" + ], + [ + "M21_040" + ], + [ + "M21_050" + ], + [] + ] +} \ No newline at end of file From 3ada860b3d34a27ebe9bc6b2e0d217834317c68c Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 27 May 2024 09:54:11 +0200 Subject: [PATCH 27/30] adjust clustering rki --- .../memilio/epidata/assessNPIEffects.py | 67 ++++++ .../memilio/epidata/npi_clustering_rki.txt | 192 ++---------------- 2 files changed, 89 insertions(+), 170 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 5c898d518e..b64c9198ff 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -18,6 +18,7 @@ # limitations under the License. ############################################################################# import os +import ast import matplotlib.pyplot as plt import matplotlib.cm as cm import matplotlib as mpl @@ -651,6 +652,71 @@ def write_clustered_npis(df_npis, cluster_function, cluster_codes, npis_corr, di with open(directory+filename, 'w') as f: print(cluster_dict, file=f) +def get_clustering_rki(directory, file_format): + # read in file + filename = 'npi_clustering_rki.txt' + path = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename) + #read npis + df_npis = pd.read_csv(directory + 'germany_counties_npi_subcat' + ".csv") + npis = pd.read_json(os.path.join(directory, 'npis.json')) + npi_codes_considered = [ + x for x in npis[dd.EngEng['npiCode']] if len(x.split('_')) == 2] + df_merged = df_npis.iloc[:, :2] + for subcode in npi_codes_considered: + # extract columns which have the subcode as part of the column + # name and sum over all these subcodes + df_merged[subcode] = df_npis.filter( + regex=subcode).sum(axis=1) + #copy date and county data + df_out = df_npis.loc[:,['Date', 'ID_County']] + with open(path, 'r') as file: + lines = file.readlines() + #read lines=maincodes + for line in lines: + name = line.split(':')[0] + codes = ast.literal_eval(line.split(':')[1]) + ind = 1 + for level in codes: + # if level is empty + if not len(level): + ind+=1 + continue + else: + cluster_name = name + '; Level ' + str(ind) + df_out[cluster_name] = df_merged.loc[:, [column for column in df_merged.columns if column in level]].sum(axis=1).values + ind+=1 + # merge maincodes as in rki + # Schools + for i in range(1,5): #=1,2,3,4 + df_out['Schools; Level '+str(i)] = df_out["Primary Schools; Level "+str(i)] + # ps level 5 and not fes level 5 + df_out['Schools; Level 5'] = df_out["Primary Schools; Level 5"] + df_out['Schools; Level 5']*=(df_out["Further Education Schools; Level 5"].replace([1,0],[0,1])) + # ps level 5 and fes level 5 + df_out['Schools; Level 6'] = df_out["Primary Schools; Level 5"] + df_out['Schools; Level 6']*=(df_out["Further Education Schools; Level 5"]) + # Public Events + for i in range(2,7): #=2,3,4,5,6 + df_out['Public Events; Level '+str(i)] = df_out['Public Events Indoor; Level '+str(i)] + # Sports + for i in range(2,4): #=2,3 + df_out['Sports; Level '+str(i)] = df_out['Sports Indoor; Level '+str(i)] + # indoor level 4 and not outdoor level 5 + df_out['Sports; Level 4'] = df_out["Sports Indoor; Level 4"] + df_out['Sports; Level 4']*=(df_out["Sports Outdoor; Level 5"].replace([1,0],[0,1])) + # indoor level 4 and outdoor level 5 + df_out['Sports; Level 5'] = df_out["Sports Indoor; Level 4"] + 
df_out['Sports; Level 5']*=(df_out["Sports Outdoor; Level 5"]) + # drop old columns + for drop_name in ['Public Events Indoor', 'Public Events Outdoor', "Primary Schools", "Further Education Schools", "Sports Indoor", "Sports Outdoor"]: + for i in range(1,7): + try: + df_out.drop(drop_name + '; Level ' + str(i), inplace = True, axis=1) + except KeyError: + pass + return df_out + + def main(): """! Main program entry.""" @@ -659,6 +725,7 @@ def main(): npis_final = [] directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') file_format = 'json' + get_clustering_rki(directory, file_format) npi_codes_considered = ['M01a_010', 'M01a_020', 'M01a_100', 'M01a_110', 'M01a_120'] analyze_npi_data(True, True, fine_resolution, npis_final, diff --git a/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt index b032abc5c9..62704cd00e 100644 --- a/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt +++ b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt @@ -1,170 +1,22 @@ -Contacts in private spaces: [ - [], - ['M01a_020'], - ['M01a_080', 'M01a_090', 'M01a_100'], - ['M01a_110', 'M01a_120'], - ['M01a_130', 'M01a_140', 'M01a_150'], - [], -] - -Contacts in public spaces: [ - [], - ['M01b_020'], - ['M01b_080', 'M01b_090', 'M01b_100'], - ['M01b_110', 'M01b_120'], - ['M01b_130', 'M01b_140', 'M01b_150'], - [], -] - -Further Education Schools: [ - [], - ['M02a_010'], - ['M02a_020'], - ['M02a_030', 'M02a_031', 'M02a_032', 'M02a_033', 'M02a_034', 'M02a_035', 'M02a_040'], - [], - [], -] - -Primary Schools: [ - [], - ['M02b_010'], - ['M02b_020'], - ['M02b_030', 'M02b_031', 'M02b_032', 'M02b_033', 'M02b_034', 'M02b_036', 'M02b_035', 'M02b_040'], - [], - [], -] - -Cultural and Educational Institutions: [ - [], - ['M06_020'], - ['M06_030', 'M06_031', 'M06_032', 'M06_033', 'M06_034'], - ['M06_040'], - [], - [], -] - -Retail: [ - [], - ['M07_020'], - ['M07_030', 'M07_031', 'M07_032', 'M07_033', 'M07_034', 'M07_036'], - ['M07_040'], - [], - [], -] - -Hospitality: [ - [], - ['M08_020'], - ['M08_030', 'M08_031', 'M08_032', 'M08_033'], - ['M08_040'], - [], - [], -] - -Services and Crafts: [ - [], - ['M09_020'], - ['M09_030', 'M09_031', 'M09_032', 'M09_033'], - ['M09_040'], - [], - [], -] - -Nightlife Establishments: [ - [], - ['M10_020'], - ['M10_030', 'M10_031', 'M10_032', 'M10_033'], - ['M10_040'], - [], - [], -] - -Accommodation: [ - [], - ['M11_020'], - ['M11_030'], - ['M11_040'], - [], - [], -] - -Indoor Sports: [ - [], - ['M12_020'], - ['M12_030'], - ['M12_040'], - ['M12_070'], - [], -] - -Outdoor Sports: [ - [], - ['M13_020'], - ['M13_030'], - ['M13_040'], - ['M13_070'], - [], -] - -Domestic Travel: [ - [], - ['M14_010'], - ['M14_020'], - ['M14_030'], - [], - [], -] - -International Travel: [ - [], - ['M15_010'], - ['M15_020'], - ['M15_030'], - [], - [], -] - -Masking: [ - ['M16_010', 'M16_020'], - ['M16_030', 'M16_040', 'M16_050', 'M16_060'], - ['M16_070', 'M16_080', 'M16_090'], - [], - [], - ['M16_100'], -] - -Workplace: [ - [], - ['M17_010'], - ['M17_020'], - ['M17_030'], - [], - [], -] - -Curfew: [ - [], - ['M18_020'], - ['M18_030'], - ['M18_040'], - [], - [], -] - -Physical Distancing: [ - ['M20_010'], - ['M20_020'], - [], - [], - [], - [], -] - -Testing: [ - ['M21_010'], - ['M21_020'], - ['M21_030'], - ['M21_040'], - ['M21_050'], - [], -] +Contacts in private spaces: [ [], ['M01a_020'], ['M01a_080', 'M01a_090', 'M01a_100'], ['M01a_110', 'M01a_120'], ['M01a_130', 'M01a_140', 'M01a_150'], [],] +Contacts in 
public spaces: [ [], ['M01b_020'], ['M01b_080', 'M01b_090', 'M01b_100'], ['M01b_110', 'M01b_120'], ['M01b_130', 'M01b_140', 'M01b_150'], [],] +Further Education Schools: [ ['M02a_010'], ['M02a_020'], ['M02a_030', 'M02a_031', 'M02a_032'], ['M02a_033', 'M02a_034', 'M02a_036'], ['M02a_035', 'M02a_040'], [],] +Primary Schools: [ ['M02b_010'], ['M02b_020'], ['M02b_030', 'M02b_031', 'M02b_032'], ['M02b_033', 'M02b_034', 'M02b_036'], ['M02b_035', 'M02b_040'], [],] +Day Care: [ ['M03_010'],['M03_020'],['M03_030'],['M03_040','M03_050'], [], ['M03_060']] +Public Events Indoor: [[],['M04_050','M04_060'],['M04_070','M04_080'],['M04_090'],['M04_100','M04_110','M04_120','M04_130'],['M04_140'],] +Public Events Outdoor: [[],['M05_050','M05_060'],['M05_070','M05_080'],['M05_090','M05_100','M05_110','M05_120'],['M05_130','M05_140','M05_150'],['M05_160'],] +Cultural and Educational Institutions: [ [], ['M06_020'], ['M06_030', 'M06_031'], ['M06_032', 'M06_033', 'M06_034'], ['M06_040'], [],] +Retail: [ [], ['M07_020'], ['M07_030'], ['M07_033', 'M07_034', 'M07_036'], ['M07_032', 'M07_031', 'M07_040'], [],] +Gastronomy: [ [], ['M08_020'], ['M08_030', 'M08_031'], ['M08_032'], ['M08_033', 'M08_040'], [],] +Services and Crafts: [ [], ['M09_020'], ['M09_030', 'M09_031'], ['M09_032'], ['M09_033', 'M09_040'], [],] +Nightlife Establishments: [ [], ['M10_020', 'M10_030', 'M10_031', 'M10_032', 'M10_033'], ['M10_040'], [], [], [],] +Accommodation: [ [], ['M11_020'], ['M11_030', 'M11_040'], [], [], [],] +Indoor Sports: [ [], ['M12_020', 'M12_030'], ['M12_040'], ['M12_070'], [], [],] +Outdoor Sports: [ [], ['M13_020'], ['M13_030'], ['M13_040'], ['M13_070'],[],] +Domestic Travel: [ ['M14_010'], ['M14_020'], ['M14_030'],[], [], [],] +International Travel: [ ['M15_010'], ['M15_020'], ['M15_030'], [], [], [],] +Masking: [ ['M16_010', 'M16_020'], ['M16_030', 'M16_040'], ['M16_050'], ['M16_060', 'M16_070', 'M16_080', 'M16_090'], ['M16_100'], [],] +Workplace: [ ['M17_010'], ['M17_020', 'M17_030'], [], [], [], [],] +Curfew: [ [], ['M18_020', 'M18_030', 'M18_040'], [], [], [], [],] +Physical Distancing: [ ['M20_010'], ['M20_020'], [], [], [], [],] +Testing: [ ['M21_010'], ['M21_020', 'M21_030'], ['M21_040', 'M21_080'], ['M21_050','M21_060','M21_070'], [], [],] From 4df063f369d372e5107133ea586c79abad65ae3e Mon Sep 17 00:00:00 2001 From: patricklnz Date: Mon, 27 May 2024 11:02:31 +0200 Subject: [PATCH 28/30] add clustering of SKBGDH --- .../memilio/epidata/assessNPIEffects.py | 49 ++++++++++++++++--- 1 file changed, 42 insertions(+), 7 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index d98991046b..6a78d8d164 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -727,20 +727,23 @@ def get_clustering_rki(directory, file_format): df_out['Public Events; Level '+str(i)] = df_out['Public Events Indoor; Level '+str(i)] # Sports for i in range(2,4): #=2,3 - df_out['Sports; Level '+str(i)] = df_out['Sports Indoor; Level '+str(i)] + df_out['Sports; Level '+str(i)] = df_out['Indoor Sports; Level '+str(i)] # indoor level 4 and not outdoor level 5 - df_out['Sports; Level 4'] = df_out["Sports Indoor; Level 4"] - df_out['Sports; Level 4']*=(df_out["Sports Outdoor; Level 5"].replace([1,0],[0,1])) + df_out['Sports; Level 4'] = df_out["Indoor Sports; Level 4"] + df_out['Sports; Level 4']*=(df_out["Outdoor Sports; Level 5"].replace([1,0],[0,1])) # indoor level 4 and 
outdoor level 5 - df_out['Sports; Level 5'] = df_out["Sports Indoor; Level 4"] - df_out['Sports; Level 5']*=(df_out["Sports Outdoor; Level 5"]) + df_out['Sports; Level 5'] = df_out["Indoor Sports; Level 4"] + df_out['Sports; Level 5']*=(df_out["Outdoor Sports; Level 5"]) # drop old columns - for drop_name in ['Public Events Indoor', 'Public Events Outdoor', "Primary Schools", "Further Education Schools", "Sports Indoor", "Sports Outdoor"]: + for drop_name in ['Public Events Indoor', 'Public Events Outdoor', "Primary Schools", "Further Education Schools", "Indoor Sports", "Outdoor Sports"]: for i in range(1,7): try: df_out.drop(drop_name + '; Level ' + str(i), inplace = True, axis=1) except KeyError: pass + df_out = merge_cluster_rki(df_out, 'SKBG') + filename = 'cluster_rki' + gd.write_dataframe(df_out, directory, filename, 'json') return df_out @@ -793,6 +796,38 @@ def write_cluster_rki(df_npis, cluster_function, npis_corr, directory, npi_code with open(directory+filename, 'w') as f: print(cluster_dict, file=f) +def merge_cluster_rki(df_rki, which): + merge_list = [] + # define column names which have to be merged + if 'S' in which: + merge_list.append('Sports') + if 'K' in which: + merge_list.append('Cultural and Educational Institutions') + if 'B' in which: + merge_list.append('Accommodation') + if 'G' in which: + merge_list.append('Gastronomy') + if ('H' in which) or ('E' in which): + merge_list.append('Retail') + if 'D' in which: + merge_list.append('Services and Crafts') + column_names = [df_rki.filter(regex=name).columns.tolist() for name in merge_list] + # put values in new df + df_merge = pd.DataFrame() + df_max_stage = df_rki.loc[:, [code for m in column_names for code in m if code.endswith('5')]] + i = 0 + for main_c in column_names: + df_merge[str(i)] = df_rki.loc[:, main_c].max(axis=1) + i+=1 + # delete old columns + df_rki.drop(main_c, axis=1, inplace=True) + # merge if condition (max 1,2,3...) applies + for stage in range(1,len(which)+1): + df_rki[str(stage)+ ' of ' + which + ' max'] = (df_merge.sum(axis=1)<=stage).astype(int) + df_rki[str(stage)+ ' of '+ which + ' on second highest level'] = (df_max_stage.sum(axis=1)==stage).astype(int) + return df_rki + + def main(): """! 
Main program entry.""" @@ -801,7 +836,7 @@ def main(): npis_final = [] directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') file_format = 'json' - get_clustering_rki(directory, file_format) + df_rki = get_clustering_rki(directory, file_format) npi_codes_considered = ['M01a_010', 'M01a_020', 'M01a_100', 'M01a_110', 'M01a_120'] From c2fab29a0020f2705461401439a8d86e696054f4 Mon Sep 17 00:00:00 2001 From: Anna Wendler <106674756+annawendler@users.noreply.github.com> Date: Tue, 28 May 2024 08:38:17 +0200 Subject: [PATCH 29/30] remove spaces from txt file and uncomment second rki cluster function --- .../memilio/epidata/assessNPIEffects.py | 171 ++++++++++-------- .../memilio/epidata/npi_clustering_rki.txt | 44 ++--- 2 files changed, 113 insertions(+), 102 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 6a78d8d164..41a8a0a9ef 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -450,10 +450,10 @@ def analyze_npi_data( npi_codes_used = np.asarray(df_npis_used.iloc[:, 2:].columns) - if manual_rki_clustering: - write_cluster_rki(df_npis, cluster_function, - npis_corr, directory, npi_codes_used) - return + # if manual_rki_clustering: + # write_cluster_rki(df_npis, cluster_function, + # npis_corr, directory, npi_codes_used) + # return # compute hierarchical clustering (via best-suited method) compare_methods = False @@ -679,11 +679,12 @@ def write_clustered_npis(df_npis, cluster_function, cluster_codes, npis_corr, di with open(directory+filename, 'w') as f: print(cluster_dict, file=f) + def get_clustering_rki(directory, file_format): # read in file filename = 'npi_clustering_rki.txt' path = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename) - #read npis + # read npis df_npis = pd.read_csv(directory + 'germany_counties_npi_subcat' + ".csv") npis = pd.read_json(os.path.join(directory, 'npis.json')) npi_codes_considered = [ @@ -694,11 +695,11 @@ def get_clustering_rki(directory, file_format): # name and sum over all these subcodes df_merged[subcode] = df_npis.filter( regex=subcode).sum(axis=1) - #copy date and county data - df_out = df_npis.loc[:,['Date', 'ID_County']] - with open(path, 'r') as file: + # copy date and county data + df_out = df_npis.loc[:, ['Date', 'ID_County']] + with open(path) as file: lines = file.readlines() - #read lines=maincodes + # read lines=maincodes for line in lines: name = line.split(':')[0] codes = ast.literal_eval(line.split(':')[1]) @@ -706,39 +707,47 @@ def get_clustering_rki(directory, file_format): for level in codes: # if level is empty if not len(level): - ind+=1 + ind += 1 continue else: cluster_name = name + '; Level ' + str(ind) - df_out[cluster_name] = df_merged.loc[:, [column for column in df_merged.columns if column in level]].sum(axis=1).values - ind+=1 + df_out[cluster_name] = df_merged.loc[:, [ + column for column in df_merged.columns if column in level]].sum(axis=1).values + ind += 1 # merge maincodes as in rki # Schools - for i in range(1,5): #=1,2,3,4 - df_out['Schools; Level '+str(i)] = df_out["Primary Schools; Level "+str(i)] + for i in range(1, 5): # =1,2,3,4 + df_out['Schools; Level ' + + str(i)] = df_out["Primary Schools; Level "+str(i)] # ps level 5 and not fes level 5 df_out['Schools; Level 5'] = df_out["Primary Schools; Level 5"] - df_out['Schools; Level 5']*=(df_out["Further Education Schools; Level 5"].replace([1,0],[0,1])) + 
df_out['Schools; Level 5'] *= ( + df_out["Further Education Schools; Level 5"].replace([1, 0], [0, 1])) # ps level 5 and fes level 5 df_out['Schools; Level 6'] = df_out["Primary Schools; Level 5"] - df_out['Schools; Level 6']*=(df_out["Further Education Schools; Level 5"]) + df_out['Schools; Level 6'] *= ( + df_out["Further Education Schools; Level 5"]) # Public Events - for i in range(2,7): #=2,3,4,5,6 - df_out['Public Events; Level '+str(i)] = df_out['Public Events Indoor; Level '+str(i)] + for i in range(2, 7): # =2,3,4,5,6 + df_out['Public Events; Level ' + + str(i)] = df_out['Public Events Indoor; Level '+str(i)] # Sports - for i in range(2,4): #=2,3 - df_out['Sports; Level '+str(i)] = df_out['Indoor Sports; Level '+str(i)] + for i in range(2, 4): # =2,3 + df_out['Sports; Level ' + + str(i)] = df_out['Indoor Sports; Level '+str(i)] # indoor level 4 and not outdoor level 5 df_out['Sports; Level 4'] = df_out["Indoor Sports; Level 4"] - df_out['Sports; Level 4']*=(df_out["Outdoor Sports; Level 5"].replace([1,0],[0,1])) + df_out['Sports; Level 4'] *= ( + df_out["Outdoor Sports; Level 5"].replace([1, 0], [0, 1])) # indoor level 4 and outdoor level 5 df_out['Sports; Level 5'] = df_out["Indoor Sports; Level 4"] - df_out['Sports; Level 5']*=(df_out["Outdoor Sports; Level 5"]) + df_out['Sports; Level 5'] *= (df_out["Outdoor Sports; Level 5"]) # drop old columns for drop_name in ['Public Events Indoor', 'Public Events Outdoor', "Primary Schools", "Further Education Schools", "Indoor Sports", "Outdoor Sports"]: - for i in range(1,7): + for i in range(1, 7): try: - df_out.drop(drop_name + '; Level ' + str(i), inplace = True, axis=1) + df_out.drop(drop_name + '; Level ' + + str(i), inplace=True, axis=1) except KeyError: pass df_out = merge_cluster_rki(df_out, 'SKBG') @@ -747,54 +756,53 @@ def get_clustering_rki(directory, file_format): return df_out - -def write_cluster_rki(df_npis, cluster_function, npis_corr, directory, npi_codes_used): - cluster_dict = dict() - # only compute cluster with more than two items - # Read the JSON file - with open(os.path.join(directory, 'npi_clustering_rki_manual.json')) as json_file: - data = json.load(json_file) - - # Convert JSON data to a DataFrame - df_rki_clustering = pd.DataFrame.from_dict( - data, orient='index').transpose() - - cluster_codes = [list(item) for sublist in df_rki_clustering.values.tolist() - for item in sublist] - # remove empty lists from cluster_codes - cluster_codes = [x for x in cluster_codes if x] - - cluster_to_combine = [item for item in cluster_codes if len(item) > 1] - name_id = 0 - for cl in cluster_to_combine: - # get index of cluster items to check if they are negative or positive correlated if they have been used - cluster_idx = [np.where(npi_codes_used == cl_code) - for cl_code in cl if np.where(npi_codes_used == cl_code)[0].size > 0] - for id_npi in cluster_idx: - if npis_corr[cluster_idx[0], id_npi] < 0: - # switch npis 0 -> 1 and 1 -> 0 - npi_to_switch = npi_codes_used[id_npi] - df_npis[npi_to_switch] -= 1 - df_npis[npi_to_switch] *= -1 - # copy values and delete item from dataframe - values = [df_npis[cl_code].values for cl_code in cl] - df_npis.drop(cl, axis=1, inplace=True) - # name clusters #TODO: how? For now cluster_0,1,2... 
- cluster_name = 'cluster_' + str(name_id) - # write cluster names in a dict (and to json) to reconstruct clusters after regression - cluster_dict[cluster_name] = cl - # TODO: how should the cluster values for the regression be calculated - cluster_value = cluster_function(np.array(values), axis=0) - df_npis[cluster_name] = cluster_value - name_id += 1 - # write npis with clusters - filename = 'clustered_npis_rki' - file_format = 'json' - gd.write_dataframe(df_npis, directory, filename, file_format) - # write cluster dict - filename = 'cluster_description_rki' - with open(directory+filename, 'w') as f: - print(cluster_dict, file=f) +# def write_cluster_rki(df_npis, cluster_function, npis_corr, directory, npi_codes_used): +# cluster_dict = dict() +# # only compute cluster with more than two items +# # Read the JSON file +# with open(os.path.join(directory, 'npi_clustering_rki_manual.json')) as json_file: +# data = json.load(json_file) + +# # Convert JSON data to a DataFrame +# df_rki_clustering = pd.DataFrame.from_dict( +# data, orient='index').transpose() + +# cluster_codes = [list(item) for sublist in df_rki_clustering.values.tolist() +# for item in sublist] +# # remove empty lists from cluster_codes +# cluster_codes = [x for x in cluster_codes if x] + +# cluster_to_combine = [item for item in cluster_codes if len(item) > 1] +# name_id = 0 +# for cl in cluster_to_combine: +# # get index of cluster items to check if they are negative or positive correlated if they have been used +# cluster_idx = [np.where(npi_codes_used == cl_code) +# for cl_code in cl if np.where(npi_codes_used == cl_code)[0].size > 0] +# for id_npi in cluster_idx: +# if npis_corr[cluster_idx[0], id_npi] < 0: +# # switch npis 0 -> 1 and 1 -> 0 +# npi_to_switch = npi_codes_used[id_npi] +# df_npis[npi_to_switch] -= 1 +# df_npis[npi_to_switch] *= -1 +# # copy values and delete item from dataframe +# values = [df_npis[cl_code].values for cl_code in cl] +# df_npis.drop(cl, axis=1, inplace=True) +# # name clusters #TODO: how? For now cluster_0,1,2... +# cluster_name = 'cluster_' + str(name_id) +# # write cluster names in a dict (and to json) to reconstruct clusters after regression +# cluster_dict[cluster_name] = cl +# # TODO: how should the cluster values for the regression be calculated +# cluster_value = cluster_function(np.array(values), axis=0) +# df_npis[cluster_name] = cluster_value +# name_id += 1 +# # write npis with clusters +# filename = 'clustered_npis_rki' +# file_format = 'json' +# gd.write_dataframe(df_npis, directory, filename, file_format) +# # write cluster dict +# filename = 'cluster_description_rki' +# with open(directory+filename, 'w') as f: +# print(cluster_dict, file=f) def merge_cluster_rki(df_rki, which): merge_list = [] @@ -811,24 +819,27 @@ def merge_cluster_rki(df_rki, which): merge_list.append('Retail') if 'D' in which: merge_list.append('Services and Crafts') - column_names = [df_rki.filter(regex=name).columns.tolist() for name in merge_list] + column_names = [df_rki.filter(regex=name).columns.tolist() + for name in merge_list] # put values in new df df_merge = pd.DataFrame() - df_max_stage = df_rki.loc[:, [code for m in column_names for code in m if code.endswith('5')]] + df_max_stage = df_rki.loc[:, [ + code for m in column_names for code in m if code.endswith('5')]] i = 0 for main_c in column_names: df_merge[str(i)] = df_rki.loc[:, main_c].max(axis=1) - i+=1 + i += 1 # delete old columns df_rki.drop(main_c, axis=1, inplace=True) # merge if condition (max 1,2,3...) 
applies - for stage in range(1,len(which)+1): - df_rki[str(stage)+ ' of ' + which + ' max'] = (df_merge.sum(axis=1)<=stage).astype(int) - df_rki[str(stage)+ ' of '+ which + ' on second highest level'] = (df_max_stage.sum(axis=1)==stage).astype(int) + for stage in range(1, len(which)+1): + df_rki[str(stage) + ' of ' + which + + ' max'] = (df_merge.sum(axis=1) <= stage).astype(int) + df_rki[str(stage) + ' of ' + which + ' on second highest level'] = ( + df_max_stage.sum(axis=1) == stage).astype(int) return df_rki - def main(): """! Main program entry.""" @@ -841,7 +852,7 @@ def main(): 'M01a_100', 'M01a_110', 'M01a_120'] analyze_npi_data(read_data=True, make_plot=True, fine_resolution=fine_resolution, npis=npis_final, - directory=directory, file_format=file_format, npi_codes_considered=False, abs_correlation=True, cluster_function=np.mean, manual_rki_clustering=True) + directory=directory, file_format=file_format, npi_codes_considered=False, abs_correlation=True, cluster_function=np.mean, manual_rki_clustering=False) if __name__ == "__main__": diff --git a/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt index 62704cd00e..63060d2634 100644 --- a/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt +++ b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt @@ -1,22 +1,22 @@ -Contacts in private spaces: [ [], ['M01a_020'], ['M01a_080', 'M01a_090', 'M01a_100'], ['M01a_110', 'M01a_120'], ['M01a_130', 'M01a_140', 'M01a_150'], [],] -Contacts in public spaces: [ [], ['M01b_020'], ['M01b_080', 'M01b_090', 'M01b_100'], ['M01b_110', 'M01b_120'], ['M01b_130', 'M01b_140', 'M01b_150'], [],] -Further Education Schools: [ ['M02a_010'], ['M02a_020'], ['M02a_030', 'M02a_031', 'M02a_032'], ['M02a_033', 'M02a_034', 'M02a_036'], ['M02a_035', 'M02a_040'], [],] -Primary Schools: [ ['M02b_010'], ['M02b_020'], ['M02b_030', 'M02b_031', 'M02b_032'], ['M02b_033', 'M02b_034', 'M02b_036'], ['M02b_035', 'M02b_040'], [],] -Day Care: [ ['M03_010'],['M03_020'],['M03_030'],['M03_040','M03_050'], [], ['M03_060']] -Public Events Indoor: [[],['M04_050','M04_060'],['M04_070','M04_080'],['M04_090'],['M04_100','M04_110','M04_120','M04_130'],['M04_140'],] -Public Events Outdoor: [[],['M05_050','M05_060'],['M05_070','M05_080'],['M05_090','M05_100','M05_110','M05_120'],['M05_130','M05_140','M05_150'],['M05_160'],] -Cultural and Educational Institutions: [ [], ['M06_020'], ['M06_030', 'M06_031'], ['M06_032', 'M06_033', 'M06_034'], ['M06_040'], [],] -Retail: [ [], ['M07_020'], ['M07_030'], ['M07_033', 'M07_034', 'M07_036'], ['M07_032', 'M07_031', 'M07_040'], [],] -Gastronomy: [ [], ['M08_020'], ['M08_030', 'M08_031'], ['M08_032'], ['M08_033', 'M08_040'], [],] -Services and Crafts: [ [], ['M09_020'], ['M09_030', 'M09_031'], ['M09_032'], ['M09_033', 'M09_040'], [],] -Nightlife Establishments: [ [], ['M10_020', 'M10_030', 'M10_031', 'M10_032', 'M10_033'], ['M10_040'], [], [], [],] -Accommodation: [ [], ['M11_020'], ['M11_030', 'M11_040'], [], [], [],] -Indoor Sports: [ [], ['M12_020', 'M12_030'], ['M12_040'], ['M12_070'], [], [],] -Outdoor Sports: [ [], ['M13_020'], ['M13_030'], ['M13_040'], ['M13_070'],[],] -Domestic Travel: [ ['M14_010'], ['M14_020'], ['M14_030'],[], [], [],] -International Travel: [ ['M15_010'], ['M15_020'], ['M15_030'], [], [], [],] -Masking: [ ['M16_010', 'M16_020'], ['M16_030', 'M16_040'], ['M16_050'], ['M16_060', 'M16_070', 'M16_080', 'M16_090'], ['M16_100'], [],] -Workplace: [ ['M17_010'], ['M17_020', 'M17_030'], [], [], 
[], [],] -Curfew: [ [], ['M18_020', 'M18_030', 'M18_040'], [], [], [], [],] -Physical Distancing: [ ['M20_010'], ['M20_020'], [], [], [], [],] -Testing: [ ['M21_010'], ['M21_020', 'M21_030'], ['M21_040', 'M21_080'], ['M21_050','M21_060','M21_070'], [], [],] +Contacts in private spaces:[ [], ['M01a_020'], ['M01a_080', 'M01a_090', 'M01a_100'], ['M01a_110', 'M01a_120'], ['M01a_130', 'M01a_140', 'M01a_150'], [],] +Contacts in public spaces:[ [], ['M01b_020'], ['M01b_080', 'M01b_090', 'M01b_100'], ['M01b_110', 'M01b_120'], ['M01b_130', 'M01b_140', 'M01b_150'], [],] +Further Education Schools:[ ['M02a_010'], ['M02a_020'], ['M02a_030', 'M02a_031', 'M02a_032'], ['M02a_033', 'M02a_034', 'M02a_036'], ['M02a_035', 'M02a_040'], [],] +Primary Schools:[ ['M02b_010'], ['M02b_020'], ['M02b_030', 'M02b_031', 'M02b_032'], ['M02b_033', 'M02b_034', 'M02b_036'], ['M02b_035', 'M02b_040'], [],] +Day Care:[ ['M03_010'],['M03_020'],['M03_030'],['M03_040','M03_050'], [], ['M03_060']] +Public Events Indoor:[[],['M04_050','M04_060'],['M04_070','M04_080'],['M04_090'],['M04_100','M04_110','M04_120','M04_130'],['M04_140'],] +Public Events Outdoor:[[],['M05_050','M05_060'],['M05_070','M05_080'],['M05_090','M05_100','M05_110','M05_120'],['M05_130','M05_140','M05_150'],['M05_160'],] +Cultural and Educational Institutions:[ [], ['M06_020'], ['M06_030', 'M06_031'], ['M06_032', 'M06_033', 'M06_034'], ['M06_040'], [],] +Retail:[ [], ['M07_020'], ['M07_030'], ['M07_033', 'M07_034', 'M07_036'], ['M07_032', 'M07_031', 'M07_040'], [],] +Gastronomy:[ [], ['M08_020'], ['M08_030', 'M08_031'], ['M08_032'], ['M08_033', 'M08_040'], [],] +Services and Crafts:[ [], ['M09_020'], ['M09_030', 'M09_031'], ['M09_032'], ['M09_033', 'M09_040'], [],] +Nightlife Establishments:[ [], ['M10_020', 'M10_030', 'M10_031', 'M10_032', 'M10_033'], ['M10_040'], [], [], [],] +Accommodation:[ [], ['M11_020'], ['M11_030', 'M11_040'], [], [], [],] +Indoor Sports:[ [], ['M12_020', 'M12_030'], ['M12_040'], ['M12_070'], [], [],] +Outdoor Sports:[ [], ['M13_020'], ['M13_030'], ['M13_040'], ['M13_070'],[],] +Domestic Travel:[ ['M14_010'], ['M14_020'], ['M14_030'],[], [], [],] +International Travel:[ ['M15_010'], ['M15_020'], ['M15_030'], [], [], [],] +Masking:[ ['M16_010', 'M16_020'], ['M16_030', 'M16_040'], ['M16_050'], ['M16_060', 'M16_070', 'M16_080', 'M16_090'], ['M16_100'], [],] +Workplace:[ ['M17_010'], ['M17_020', 'M17_030'], [], [], [], [],] +Curfew:[ [], ['M18_020', 'M18_030', 'M18_040'], [], [], [], [],] +Physical Distancing:[ ['M20_010'], ['M20_020'], [], [], [], [],] +Testing:[ ['M21_010'], ['M21_020', 'M21_030'], ['M21_040', 'M21_080'], ['M21_050','M21_060','M21_070'], [], [],] From 3eaf31ccfae1bc7497090e93d47b3d7b7ac17456 Mon Sep 17 00:00:00 2001 From: Anna Wendler <106674756+annawendler@users.noreply.github.com> Date: Thu, 13 Jun 2024 10:59:53 +0200 Subject: [PATCH 30/30] no level 1 in clustering anymore --- .../memilio/epidata/assessNPIEffects.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py index 41a8a0a9ef..98f25825da 100644 --- a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py +++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py @@ -710,13 +710,17 @@ def get_clustering_rki(directory, file_format): ind += 1 continue else: - cluster_name = name + '; Level ' + str(ind) - df_out[cluster_name] = df_merged.loc[:, [ - column for column in df_merged.columns if column in 
level]].sum(axis=1).values
-                    ind += 1
+                    # do not include clusters with level 1 (analogous to RKI)
+                    if ind == 1:
+                        ind += 1
+                    else:
+                        cluster_name = name + '; Level ' + str(ind)
+                        df_out[cluster_name] = df_merged.loc[:, [
+                            column for column in df_merged.columns if column in level]].sum(axis=1).values
+                        ind += 1
     # merge maincodes as in rki
     # Schools
-    for i in range(1, 5):  # =1,2,3,4
+    for i in range(2, 5):  # =2,3,4
         df_out['Schools; Level ' +
                str(i)] = df_out["Primary Schools; Level "+str(i)]
     # ps level 5 and not fes level 5
@@ -744,7 +748,7 @@ def get_clustering_rki(directory, file_format):
         df_out['Sports; Level 5'] *= (df_out["Outdoor Sports; Level 5"])
     # drop old columns
     for drop_name in ['Public Events Indoor', 'Public Events Outdoor', "Primary Schools", "Further Education Schools", "Indoor Sports", "Outdoor Sports"]:
-        for i in range(1, 7):
+        for i in range(2, 7):
             try:
                 df_out.drop(drop_name + '; Level ' +
                             str(i), inplace=True, axis=1)
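
Note on the npi_clustering_rki.txt layout consumed by get_clustering_rki(): each line holds a main-code name, a colon, and a Python-style list of six per-level subcode lists, which the function evaluates with ast.literal_eval. The snippet below is only an illustrative, self-contained sketch of that parsing step; the example line is copied from the clustering file above, while the helper name parse_cluster_line is made up for illustration and is not part of the patches.

import ast

def parse_cluster_line(line):
    # Split on the first colon: left of it is the main-code name,
    # right of it is a Python-style list of per-level subcode lists.
    name, levels_str = line.split(':', 1)
    return name.strip(), ast.literal_eval(levels_str)

# Example line following the layout of npi_clustering_rki.txt.
example = "Curfew:[ [], ['M18_020', 'M18_030', 'M18_040'], [], [], [], [],]"
name, levels = parse_cluster_line(example)
print(name)       # Curfew
print(levels[1])  # ['M18_020', 'M18_030', 'M18_040']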
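Where the patches combine "Primary Schools; Level 5" with "Further Education Schools; Level 5" (and likewise the indoor/outdoor sports levels), the columns are treated as 0/1 indicators: multiplying two indicators acts as a logical AND, and replace([1, 0], [0, 1]) flips an indicator to express "not". A minimal stand-alone sketch of this pattern, using made-up column names and data rather than the actual NPI columns:

import pandas as pd

# Hypothetical 0/1 indicator columns for two measures A and B.
df = pd.DataFrame({'A': [1, 1, 0, 0], 'B': [1, 0, 1, 0]})

# A active and B not active: multiply A by the negated B indicator.
a_and_not_b = df['A'] * df['B'].replace([1, 0], [0, 1])
# A active and B active: plain product of the two indicators.
a_and_b = df['A'] * df['B']

print(a_and_not_b.tolist())  # [0, 1, 0, 0]
print(a_and_b.tolist())      # [1, 0, 0, 0]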