diff --git a/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py
new file mode 100644
index 0000000000..98f25825da
--- /dev/null
+++ b/pycode/memilio-epidata/memilio/epidata/assessNPIEffects.py
@@ -0,0 +1,864 @@
+#############################################################################
+# Copyright (C) 2020-2023 German Aerospace Center (DLR-SC)
+#
+# Authors: Martin J. Kuehn
+#
+# Contact: Martin J. Kuehn
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#############################################################################
+import ast
+import json
+import os
+from dataclasses import dataclass
+
+import matplotlib as mpl
+import matplotlib.cm as cm
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from scipy.cluster import hierarchy
+from scipy.spatial import distance
+from sklearn.metrics import silhouette_samples, silhouette_score
+
+from memilio.epidata import customPlot
+from memilio.epidata import defaultDict as dd
+from memilio.epidata import getDataIntoPandasDataFrame as gd
+from memilio.epidata import getNPIData as gnpi
+
+# activate copy-on-write (CoW) for more predictable behaviour of pandas
+# DataFrames
+pd.options.mode.copy_on_write = True
+
+mpl.use('QtAgg')
+
+
+@dataclass
+class ScoredCluster:
+    sc1: float  # block-correlation score (see evaluate_clustering)
+    sc2: float  # silhouette score
+    clustering: np.ndarray  # mapping of item index to cluster index
+
+
+def evaluate_clustering(corr_mat, corr_mat_pairw_dist, idx_to_cluster_idx,
+                        indices_all):
+    """! Computes scores for a particular clustering based on the
+    correlation matrix. The first score measures how well high absolute
+    correlation values (e.g., between 0.5 and 0.75 or between 0.75 and 1)
+    are concentrated in the diagonal blocks of the clustered correlation
+    matrix rather than in the offdiagonal blocks; the second score is the
+    average silhouette coefficient.
+
+    @param corr_mat Correlation matrix between the features / data set items
+        that were clustered.
+    @param corr_mat_pairw_dist Pairwise distance computed from correlation
+        matrix.
+    @param idx_to_cluster_idx Mapping of data item to cluster index.
+    @param indices_all List of indices of all data items (currently unused).
+
+    @return Scores [block-correlation score, silhouette score] for the
+        provided clustering.
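+
+    Illustrative example: with items {0, 1, 2} and clusters {0, 1} and {2},
+    the diagonal blocks are corr_mat[0:2, 0:2] and corr_mat[2:3, 2:3]; the
+    first score rewards clusterings whose absolute correlations are large
+    inside these blocks and small everywhere else.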
+ """ + # store indices of clusters + idx_orig = np.array(range(len(idx_to_cluster_idx))) + clusters = [idx_orig[idx_to_cluster_idx == i] + for i in range(idx_to_cluster_idx.max()+1)] + # store remaining/perpendicular indices for all clusters + clusters_perp = [idx_orig[idx_to_cluster_idx != i] + for i in range(idx_to_cluster_idx.max()+1)] + + ssc = np.nan + # own score on diagonal and offdiagonal correlation values + # extract correlation values of block diagonals and offdiagonals separately + corr_diag = [] + corr_offdiag = [] + for ii in range(len(clusters)): + corr_diag = np.append(corr_diag, abs( + corr_mat[np.ix_(clusters[ii], clusters[ii])].flatten())) + corr_offdiag = np.append(corr_offdiag, abs( + corr_mat[np.ix_(clusters[ii], clusters_perp[ii])].flatten())) + + step_width = 0.02 + nb_steps = int(1/step_width) + corr_thresholds = np.linspace(0+step_width, 1, nb_steps) + # p_offdiag should increase fast to 1 if cluster is good + p_offdiag = np.array([len(np.where(corr_offdiag <= corr_thresholds[i])[0]) + for i in range(len(corr_thresholds))]) / len(corr_offdiag) + # p_diag should be 1 for most time steps and only decrease fast at the end if cluster is good + p_diag = np.array([len(np.where(corr_diag >= corr_thresholds[i])[0]) + for i in range(len(corr_thresholds))]) / len(corr_diag) + + corr_clustering_score = (p_diag.sum() + p_offdiag.sum()) / (2 * nb_steps) + + # silhouette score + # Check for clusterings where every sample is in their own cluster or where all samples are in one cluster. + # In that case we cannot use silhouette_samples because we have to satisfy cluster numbers to be in '2 to n_samples - 1 (inclusive)' + if (len(np.unique(idx_to_cluster_idx)) < len(idx_to_cluster_idx)) and (len(np.unique(idx_to_cluster_idx)) > 1): + + # TODO: Do we want to compute the silhouette score on the distance of npis_corr or abs(npis_corr)? + # Otherwise formulated: How do we work with negative correlations for the correlation distance? -1 = 1 ? + sample_silhouette_values = silhouette_samples( + corr_mat_pairw_dist, idx_to_cluster_idx, metric="precomputed") + ssc = silhouette_score(corr_mat_pairw_dist, + idx_to_cluster_idx, metric="precomputed") + + # TODO: Check this is really always true! + if ssc != np.sum(sample_silhouette_values)/len(sample_silhouette_values): + print('Error: Average not the same as computed score....') + + # return clusters and mean silhouette coefficient + return [corr_clustering_score, ssc] + + +def compute_hierarch_clustering(corr_pairwdist, + methods=['single', 'complete', 'average', + 'weighted', 'centroid', 'median', + 'ward']): + """! Computes a hierarchical clustering for a (list of) method(s) and + provides the maximum cophenetic distance(s) as well as a score for the + clustering (see @method evaluate_clustering(...)). + + @param corr_pairwdist Computed pairwise distance between the features / data + set items. + @param method method or list of methods to compute the hierarchical + clustering. + + @return (List of) hierarchical clustering(s), maximum cophenetic distance(s) + and scores of the hierarchical clustering. + """ + + # NOTE: if changing method, pay attention to linkage methods; + # 'centroid', 'median', and 'ward' are correctly defined only if + # Euclidean pairwise metric is used in distance matrix that we used as input. 
+
+    # Based on the distances, we compute a hierarchical clustering for
+    # different methods
+    max_coph_corr_coeff = -1
+    scores = dict()
+    # allow single entry
+    if not isinstance(methods, list):
+        methods = [methods]
+    # iterate over list
+    for method in methods:
+        cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=method)
+        # compute cophenetic correlation coefficient and cophenetic distance
+        # matrix
+        coph_corr_coeff, coph_dist_mat = hierarchy.cophenet(
+            cluster_hierarch, corr_pairwdist)
+        scores[method] = coph_corr_coeff
+        if coph_corr_coeff > max_coph_corr_coeff:
+            max_coph_corr_coeff = coph_corr_coeff
+            max_method = method
+            max_coph_dist_mat = coph_dist_mat
+
+    # recompute hierarchical clustering for the method with maximal
+    # cophenetic correlation coefficient
+    cluster_hierarch = hierarchy.linkage(corr_pairwdist, method=max_method)
+
+    print(
+        "Cophenetic correlation coefficient for method " + max_method +
+        ": " + str(max_coph_corr_coeff))
+
+    return cluster_hierarch, max_coph_dist_mat
+
+
+def flatten_hierarch_clustering(corr_mat, corr_mat_pairw_dist,
+                                cluster_hierarch, weights, method):
+    """! Flattens a hierarchical clustering for one or more cophenetic
+    distance thresholds and evaluates each resulting flat clustering with
+    respect to the corresponding correlation matrix.
+
+    @param corr_mat Correlation matrix between the features / data set items
+        clustered hierarchically.
+    @param corr_mat_pairw_dist Pairwise distance computed from correlation
+        matrix.
+    @param cluster_hierarch Hierarchical clustering of given features / data
+        set items.
+    @param weights Cophenetic distance or list of cophenetic distances to
+        compute the flat clustering(s).
+    @param method Linkage method used (currently only kept for reference).
+
+    @return Best-scoring flat clustering according to the block-correlation
+        score.
+    """
+
+    # all indices in npis_corr from 0 to n-1
+    npi_indices_all = set(range(corr_mat.shape[0]))
+    scored_clusterings = []
+    # allow single entries
+    if not isinstance(weights, list):
+        weights = [weights]
+
+    # iterate over weights
+    for weight in weights:
+        # use the given weight to flatten the dendrogram
+        npi_idx_to_cluster_idx = hierarchy.fcluster(
+            cluster_hierarch, weight, criterion='distance')
+
+        # start clustering labels with zero
+        if npi_idx_to_cluster_idx.min() == 1:
+            npi_idx_to_cluster_idx -= 1
+
+        # evaluate clustering
+        scores = evaluate_clustering(
+            corr_mat, corr_mat_pairw_dist, npi_idx_to_cluster_idx,
+            npi_indices_all)
+
+        scored_clusterings.append(ScoredCluster(
+            scores[0], scores[1], npi_idx_to_cluster_idx))
+
+    # select the best clustering; currently the block-correlation score
+    # (sc1) decides, the silhouette-based and combined selections below are
+    # computed as alternatives
+    pos_best_clustering_1 = np.nanargmax([s.sc1 for s in scored_clusterings])
+    pos_best_clustering_2 = np.nanargmax([s.sc2 for s in scored_clusterings])
+    pos_best_clustering_12 = np.nanargmax(
+        [s.sc1 / scored_clusterings[pos_best_clustering_1].sc1 +
+         s.sc2 / scored_clusterings[pos_best_clustering_2].sc2
+         for s in scored_clusterings])
+    return scored_clusterings[pos_best_clustering_1].clustering
+
+
+def silhouette(pairw_dist_mat, cluster_labels, method=''):
+    """! Creates a silhouette plot for one or more clusterings based on a
+    precomputed pairwise distance matrix; method is only used to label the
+    output."""
+
+    # if only one clustering is given, surround it by a list to make the
+    # loop work
+    if isinstance(cluster_labels, (list, np.ndarray)) and \
+            isinstance(cluster_labels[0], (int, np.integer)):
+        cluster_labels = [cluster_labels]
+
+    plt.figure()
+
+    sample_silhouette_values = None
+    for clustering in cluster_labels:
+
+        # get number of clusters (naming starting with 0,1,2...)
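+        # Reminder: the silhouette coefficient of sample i is
+        #   s(i) = (b(i) - a(i)) / max(a(i), b(i)),
+        # where a(i) is the mean distance of i to the other members of its
+        # own cluster and b(i) the smallest mean distance of i to the
+        # members of any other cluster; s(i) lies in [-1, 1].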
+        n_clusters = clustering.max()+1
+
+        # Check for clusterings where every sample is in its own cluster or
+        # where all samples are in one cluster. In these cases we cannot use
+        # silhouette_samples because the number of clusters has to be in
+        # '2 to n_samples - 1 (inclusive)'.
+        if (len(np.unique(clustering)) < len(clustering)) and \
+                (len(clustering) == pairw_dist_mat.shape[0]) and \
+                (len(np.unique(clustering)) > 1):
+
+            sample_silhouette_values = silhouette_samples(
+                pairw_dist_mat, clustering, metric="precomputed")
+
+            y_lower = 10
+            for i in range(n_clusters):
+                ith_cluster_silhouette_values = \
+                    sample_silhouette_values[clustering == i]
+                ith_cluster_silhouette_values.sort()
+
+                size_cluster_i = ith_cluster_silhouette_values.shape[0]
+                y_upper = y_lower + size_cluster_i
+
+                color = cm.hsv(float(i) / n_clusters)
+                plt.fill_betweenx(
+                    np.arange(y_lower, y_upper),
+                    0,
+                    ith_cluster_silhouette_values,
+                    facecolor=color,
+                    edgecolor=color,
+                    alpha=0.7,
+                )
+
+                plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
+
+                y_lower = y_upper + 10
+
+            plt.xlabel("The silhouette coefficient values")
+            plt.ylabel("Cluster label")
+
+            plt.axvline(x=0, c='black', alpha=0.3)
+            plt.axvline(x=0.25, c='black', alpha=0.3)
+            plt.axvline(x=0.5, c='black', alpha=0.3)
+            plt.axvline(x=0.75, c='black', alpha=0.3)
+
+            plt.yticks([])  # clear the y-axis labels / ticks
+            plt.xticks([-0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])
+
+            plt.title(
+                "Silhouette analysis for " +
+                (method + " " if method else "") +
+                "clustering with n_clusters = " + str(n_clusters))
+            plt.tight_layout()
+            os.makedirs('figures', exist_ok=True)
+            plt.savefig('figures/silhouette_plot' +
+                        ('_' + method if method else '') + '.png')
+            # plt.show()
+
+        else:
+            print("Dimension mismatch or degenerate clustering; "
+                  "no silhouette plot created.")
+
+    return sample_silhouette_values
+
+
+def analyze_npi_data(
+        read_data, make_plot, fine_resolution, npis, directory, file_format,
+        npi_codes_considered, abs_correlation, cluster_function,
+        manual_rki_clustering=False):
+
+    # TODO: Error in custom plot list; plotting is disabled until fixed
+    make_plot = False
+
+    if not read_data:
+        # TODO: get_npi_data should return df_npis, npis (or the else branch
+        # below should be removed); currently df_npis is only defined when
+        # read_data is True
+        gnpi.get_npi_data(fine_resolution=2,
+                          file_format=dd.defaultDict['file_format'],
+                          out_folder=dd.defaultDict['out_folder'],
+                          start_date=dd.defaultDict['start_date'],
+                          end_date=dd.defaultDict['end_date'],
+                          make_plot=dd.defaultDict['make_plot'],
+                          )
+
+    else:  # read formatted file
+        npis = pd.read_excel(
+            os.path.join(
+                directory, 'datensatzbeschreibung_massnahmen.xlsx'),
+            sheet_name=3, engine='openpyxl')
+        if fine_resolution > 0:
+            if fine_resolution == 1:
+                filename = 'germany_counties_npi_subcat_incgrouped'
+            else:
+                filename = 'germany_counties_npi_subcat'
+        else:
+            filename = 'germany_counties_npi_maincat'
+        df_npis = pd.read_csv(os.path.join(directory, filename + ".csv"))
+    try:
+        df_npis = df_npis.drop('Unnamed: 0', axis=1)
+    except KeyError:
+        pass
+    # df_npis = mdfs.extract_subframe_based_on_dates(df_npis, date(2021,1,1), date(2021,6,1))
+    # TODO: Check if we need the next line
+    npis = pd.read_json(os.path.join(directory, 'npis.json'))
+    # get code levels (main/subcodes) and position of main codes
+
+    # check if any other integer than 0 (not implemented) or 1 (implemented)
+    # is used (maybe to specify the kind of implementation); see the boolean
+    # check below
+
+    if not npi_codes_considered:
+        npi_codes_considered = [
+            x for x in npis[dd.EngEng['npiCode']] if len(x.split('_')) == 2]
+    df_merged = df_npis.iloc[:, :2]
+    for subcode in npi_codes_considered:
+        # extract columns which have the subcode as part of the column
+        # name and sum over all these subcodes
+        df_merged[subcode] = df_npis.filter(
+            regex=subcode).sum(axis=1)
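+    # Note: df_npis.filter(regex=subcode) matches every column whose name
+    # contains the subcode (e.g., incidence-dependent variants such as
+    # 'M01a_010_1', if present); the row-wise sum collapses them into one
+    # column per subcode.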
+
+    # make sure that only considered subcodes are in the npis dataframe
+    npis = npis[npis[dd.EngEng['npiCode']].isin(npi_codes_considered)]
+
+    # use copy on write of df_merged
+    df_npis = df_merged[:]
+
+    # warn if anything other than 0 (not implemented) and 1 (implemented)
+    # occurs in the NPI columns
+    if not df_npis[npi_codes_considered].isin([0, 1]).all().all():
+
+        print("Info: Please ensure that NPI information is only boolean.")
+
+    else:
+        # sum over different NPIs and plot share of counties implementing
+        # these NPIs versus counties without corresponding actions
+        df_npis_aggregated = df_npis.groupby(
+            dd.EngEng['date']).agg(
+            {i: "sum" for i in npi_codes_considered}).copy()
+        npis_total_sum = df_npis_aggregated.sum()
+
+        npi_codes_empty = list(np.array(npi_codes_considered)[
+            np.where(npis_total_sum == 0)[0]])
+
+        npi_unused_indices_all = []
+        npi_used_indices_all = []
+        npi_unused_indices = []
+        npi_used_indices = []
+        for i in range(len(npi_codes_considered)):
+            if npi_codes_considered[i] in npi_codes_empty:
+                npi_unused_indices.append(i)
+                npi_unused_indices_all.append(
+                    np.where(npis[dd.EngEng['npiCode']] ==
+                             npi_codes_considered[i])[0][0])
+            else:
+                npi_used_indices.append(i)
+                npi_used_indices_all.append(
+                    np.where(npis[dd.EngEng['npiCode']] ==
+                             npi_codes_considered[i])[0][0])
+
+        npis_unused = np.array(npis['Description'])[
+            npi_unused_indices_all]  # for all fr1 codes: len=12
+        npis_used = np.array(npis['Description'])[
+            npi_used_indices_all]  # for all fr1 codes: len=155
+        npi_codes_used = list(
+            np.array(npi_codes_considered)[npi_used_indices])
+        npi_codes_unused = list(
+            np.array(npi_codes_considered)[npi_unused_indices])
+
+        # write unused categories to file
+        if fine_resolution > 0:
+            if fine_resolution == 1:
+                filename = 'unused_subcats_incgrouped.txt'
+            else:
+                filename = 'unused_subcats.txt'
+        else:
+            filename = 'unused_maincats.txt'
+        with open(os.path.join(directory, filename), 'w') as file_npi:
+            for i in range(len(npis_unused)):
+                file_npi.write(npi_codes_unused[i] + ": " + npis_unused[i])
+                file_npi.write("\n")
+
+        # write used categories to file
+        if fine_resolution > 0:
+            if fine_resolution == 1:
+                filename = 'used_subcats_incgrouped.txt'
+            else:
+                filename = 'used_subcats.txt'
+        else:
+            filename = 'used_maincats.txt'
+        with open(os.path.join(directory, filename), 'w') as file_npi:
+            for i in range(len(npis_used)):
+                file_npi.write(npi_codes_used[i] + ": " + npis_used[i])
+                file_npi.write("\n")
+
+        df_npis_used = df_npis[[dd.EngEng['date'],
+                                dd.EngEng['idCounty']] + npi_codes_used].copy()
+        if fine_resolution > 0:
+            if fine_resolution == 1:
+                filename = 'germany_counties_npi_subcat_used_incgrouped'
+            else:
+                filename = 'germany_counties_npi_subcat_used'
+        else:
+            filename = 'germany_counties_npi_maincat_used'
+        gd.write_dataframe(df_npis_used, directory, filename, file_format)
+
+        # compute correlations
+        npis_corr = df_npis_used.iloc[:, 2:].corr().values
+        npis_corr_abs = abs(npis_corr)
+        # define which correlation matrix should be used based on the input
+        # variable and adjust the colormap accordingly
+        if abs_correlation:
+            npis_corr_mat = npis_corr_abs
+            vmin = 0
+            cmap = 'gray_r'
+        else:
+            npis_corr_mat = npis_corr
+            vmin = -1
+            cmap = 'RdGy'
+        # plot correlation matrix
+        plt.imshow(npis_corr_mat, cmap=cmap, vmin=vmin, vmax=1)
+        plt.colorbar()
+        # plot histogram
+        plt.figure()
+        plt.hist(npis_corr_mat.flatten(), bins=50)
+        plt.title("Correlation histogram", fontsize=18)
+        plt.xlabel("Correlation", fontsize=12)
+        plt.ylabel("Number of values", fontsize=12)
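+
+        # Note on the next step: distance.pdist treats each row of the given
+        # matrix as one observation, i.e., for an (n x n) correlation matrix
+        # it returns a condensed vector of the n*(n-1)/2 pairwise Euclidean
+        # distances between rows.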
+
+        # We understand the rows of npis_corr, the correlations of one NPI
+        # to the others, as one node in the #NPIs-used-dimensional space.
+        # We compute the pairwise distances of these nodes. Then, nodes with
+        # similar correlations towards all other nodes exhibit small
+        # distances.
+        corr_pairwdist = distance.pdist(
+            npis_corr_mat, metric='euclidean')
+
+        npi_codes_used = np.asarray(df_npis_used.iloc[:, 2:].columns)
+
+        # if manual_rki_clustering:
+        #     write_cluster_rki(df_npis, cluster_function,
+        #                       npis_corr, directory, npi_codes_used)
+        #     return
+
+        # compute hierarchical clustering (via best-suited method)
+        compare_methods = False
+        if compare_methods:
+            # TODO: criterion to select from methods (output scores from
+            # flatten_hierarch_clustering?!)
+            # centroid
+            method = 'centroid'
+            cluster_hierarch, coph_dist_mat = compute_hierarch_clustering(
+                corr_pairwdist,
+                method)
+            # plot dendrogram
+            plt.figure()
+            plt.title(method)
+            hierarchy.dendrogram(cluster_hierarch)
+            # plt.show()
+            max_coph_dist = coph_dist_mat.max()
+            # TODO: Discuss why npis_corr is used as input and not
+            # corr_pairwdist
+            npi_idx_to_cluster_idx = flatten_hierarch_clustering(
+                npis_corr_mat, distance.squareform(
+                    corr_pairwdist), cluster_hierarch,
+                [wg * max_coph_dist
+                 for wg in np.linspace(0.01, 1, 500)], method)
+
+            samples = silhouette(distance.squareform(
+                corr_pairwdist), npi_idx_to_cluster_idx, method=method)
+
+            # ward
+            method = 'ward'
+            cluster_hierarch, coph_dist_mat = compute_hierarch_clustering(
+                corr_pairwdist,
+                method)
+            # plot dendrogram
+            plt.figure()
+            plt.title(method)
+            hierarchy.dendrogram(cluster_hierarch)
+            max_coph_dist = coph_dist_mat.max()
+            npi_idx_to_cluster_idx = flatten_hierarch_clustering(
+                npis_corr_mat, distance.squareform(
+                    corr_pairwdist), cluster_hierarch,
+                [wg * max_coph_dist
+                 for wg in np.linspace(0.01, 1, 500)], method)
+
+            samples = silhouette(distance.squareform(
+                corr_pairwdist), npi_idx_to_cluster_idx, method=method)
+
+            # average
+            method = 'average'
+            cluster_hierarch, coph_dist_mat = compute_hierarch_clustering(
+                corr_pairwdist,
+                method)
+            # plot dendrogram
+            plt.figure()
+            plt.title(method)
+            hierarchy.dendrogram(cluster_hierarch)
+            max_coph_dist = coph_dist_mat.max()
+            npi_idx_to_cluster_idx = flatten_hierarch_clustering(
+                npis_corr_mat, distance.squareform(
+                    corr_pairwdist), cluster_hierarch,
+                [wg * max_coph_dist
+                 for wg in np.linspace(0.01, 1, 500)], method)
+
+            samples = silhouette(distance.squareform(
+                corr_pairwdist), npi_idx_to_cluster_idx, method=method)
+        # end compare_methods
+
+        else:  # simplified, centroid had fewer clusters in the example
+            method = 'centroid'
+            cluster_hierarch, coph_dist_mat = compute_hierarch_clustering(
+                corr_pairwdist,
+                method)
+            # plot dendrogram
+            plt.figure()
+            plt.title(method)
+            hierarchy.dendrogram(cluster_hierarch)
+            os.makedirs('figures', exist_ok=True)
+            plt.savefig('figures/dendrogram.png')
+            # plt.show()
+            max_coph_dist = coph_dist_mat.max()
+
+            npi_idx_to_cluster_idx = flatten_hierarch_clustering(
+                npis_corr_mat, distance.squareform(
+                    corr_pairwdist), cluster_hierarch,
+                [wg * max_coph_dist
+                 for wg in np.linspace(0.2, 1, 500)], method)  # TODO reset to 0.01 to 1 with 500 samples
+
+            silhouette(distance.squareform(
+                corr_pairwdist), npi_idx_to_cluster_idx, method=method)
+
+        # TODO: cluster_dict is populated below but never read or written
+        cluster_dict = dict()
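+        # Illustrative example: if npi_idx_to_cluster_idx = [0, 2, 0, 1],
+        # the NPIs at positions 0 and 2 are merged into cluster "CM_000",
+        # the NPI at position 3 into "CM_001" and the NPI at position 1
+        # into "CM_002".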
+        cluster_codes = [[]
+                         for i in range(len(np.unique(npi_idx_to_cluster_idx)))]
+        cluster_desc = [[]
+                        for i in range(len(np.unique(npi_idx_to_cluster_idx)))]
+        for i in range(len(npi_idx_to_cluster_idx)):
+            # use CM for "Cluster Massnahme"
+            cluster_dict[npi_codes_used[i]] = \
+                "CM_" + str(npi_idx_to_cluster_idx[i]).zfill(3)
+            cluster_codes[npi_idx_to_cluster_idx[i]].append(
+                npi_codes_used[i])
+            cluster_desc[npi_idx_to_cluster_idx[i]].append(
+                str(npis_used[i]))
+
+        # create clustered dataframe
+        df_npis_clustered = df_npis[[
+            dd.EngEng['date'], dd.EngEng['idCounty']]].copy()
+
+        for i in range(len(cluster_codes)):
+            df_npis_clustered["CM_" + str(i).zfill(3)] = \
+                cluster_function(df_npis[cluster_codes[i]], axis=1)
+
+        # TODO: why were 2 and 4 clustered in the example??
+        npis_corr_cluster = df_npis_clustered.iloc[:, 2:].corr()
+        # npis_corr_cluster[abs(npis_corr_cluster)<0.25] = 0
+        plt.figure()
+        plt.imshow(npis_corr_cluster, cmap=cmap, vmin=vmin, vmax=1)
+        plt.title('Correlation of clustered NPIs')
+        plt.xlabel('NPI cluster')
+        plt.ylabel('NPI cluster')
+        plt.colorbar()
+
+        # write cluster composition to file
+        if fine_resolution > 0:
+            if fine_resolution == 1:
+                filename = 'clusters_subcats_incgrouped.txt'
+            else:
+                filename = 'clusters_subcats.txt'
+        else:
+            filename = 'clusters_maincats.txt'
+        with open(os.path.join(directory, filename), 'w') as file_npi:
+            for i in range(len(cluster_codes)):
+                file_npi.write("Cluster " + str(i) + "\n")
+                for j in range(len(cluster_codes[i])):
+                    file_npi.write(
+                        cluster_codes[i][j] + ": " + cluster_desc[i][j])
+                    file_npi.write("\n")
+                file_npi.write("\n")
+
+        npi_idx_new = np.argsort(npi_idx_to_cluster_idx)
+        npis_corr_reorder = npis_corr_mat[npi_idx_new, :][:, npi_idx_new]
+
+        plt.figure()
+        plt.imshow(npis_corr_reorder, cmap=cmap, vmin=vmin, vmax=1)
+        plt.title('Correlation of reordered NPIs')
+        plt.colorbar()
+
+        write_clustered_npis(df_npis, cluster_function, cluster_codes,
+                             npis_corr_mat, directory, npi_codes_used)
+
+        # npi_indices_all = set(range(npis_corr.shape[0]))
+        # for i in [40]:  # [10, 20, 40, 80, 160]:
+        #     kmeans_npis = KMeans(n_clusters=i).fit(df_npis_used.iloc[:, 2:].T)
+        #     evaluate_clustering(npis_corr, kmeans_npis.labels_, npi_indices_all)
+
+        # for i in [40]:  # [10, 20, 40, 80, 160]:
+        #     kmeans_corr = KMeans(n_clusters=i).fit(npis_corr)
+        #     evaluate_clustering(npis_corr, kmeans_corr.labels_, npi_indices_all)
+
+        # corr_threshold = 0.5
+        # corr_indices_threshold = np.where(npis_corr > corr_threshold)
+        # npis_corr_threshold = np.zeros(npis_corr.shape)
+        # npis_corr_threshold[corr_indices_threshold] = npis_corr[corr_indices_threshold]
+        # plt.imshow(npis_corr_threshold, cmap='gray_r')
+
+        # plot share of counties that implement the main categories
+        if make_plot:
+            # plot subsets of curves separately for better distinction
+            j = 0
+            if fine_resolution > 0:
+                num_images = 15
+            else:
+                num_images = 1
+            for i in [
+                    slice(
+                        int(len(npi_codes_used) / num_images) * i,
+                        min(
+                            int(len(npi_codes_used) / num_images) *
+                            (i + 1),
+                            len(npis_used))) for i in range(
+                    num_images + 1)]:
+                # TODO: plotList doesn't work
+                customPlot.plotList(df_npis_aggregated.index,
+                                    [df_npis_aggregated[code]
+                                     for code in npi_codes_used[i]],
+                                    npis_used[i],
+                                    'Counties implementing NPI main categories',
+                                    'Date', 'Number', "Counties_NPI_main_" +
+                                    str(j) + "_of_" + str(num_images))
+                plt.tight_layout()
+                j += 1
+
+
+def write_clustered_npis(df_npis, cluster_function, cluster_codes,
+                         npis_corr, directory, npi_codes_used):
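+    """! Combines each multi-member cluster into a single column (a sketch
+    of the intended behavior, assuming boolean NPI columns): members that
+    are negatively correlated with the first cluster member are inverted
+    (x -> 1 - x), the member columns are dropped and replaced by
+    cluster_function applied across them, and the cluster composition is
+    written to disk so clusters can be reconstructed after regression.
+    """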
+    cluster_dict = dict()
+    # only combine clusters that contain more than one item
+    cluster_to_combine = [item for item in cluster_codes if len(item) > 1]
+    name_id = 0
+    for cl in cluster_to_combine:
+        # get indices of the cluster items to check whether they are
+        # negatively or positively correlated with the first item
+        cluster_idx = [np.where(npi_codes_used == cl_code)[0][0]
+                       for cl_code in cl]
+        for id_npi in cluster_idx:
+            if npis_corr[cluster_idx[0], id_npi] < 0:
+                # switch npis 0 -> 1 and 1 -> 0
+                npi_to_switch = npi_codes_used[id_npi]
+                df_npis[npi_to_switch] -= 1
+                df_npis[npi_to_switch] *= -1
+        # copy values and delete items from dataframe
+        values = [df_npis[cl_code].values for cl_code in cl]
+        df_npis.drop(cl, axis=1, inplace=True)
+        # name clusters  # TODO: how? For now cluster_0, 1, 2, ...
+        cluster_name = 'cluster_' + str(name_id)
+        # store cluster names in a dict (written to JSON below) to
+        # reconstruct clusters after regression
+        cluster_dict[cluster_name] = cl
+        # TODO: how should the cluster values for the regression be
+        # calculated?
+        cluster_value = cluster_function(np.array(values), axis=0)
+        df_npis[cluster_name] = cluster_value
+        name_id += 1
+    # write npis with clusters
+    filename = 'clustered_npis'
+    file_format = 'json'
+    gd.write_dataframe(df_npis, directory, filename, file_format)
+    # write cluster dict as valid JSON
+    filename = 'cluster_description.json'
+    with open(os.path.join(directory, filename), 'w') as f:
+        json.dump({key: list(val) for key, val in cluster_dict.items()}, f)
+
+
+def get_clustering_rki(directory, file_format):
+    """! Builds the manually defined RKI clustering from
+    npi_clustering_rki.txt and writes the resulting dataframe to disk.
+    Note: file_format is currently unused; the output is always written as
+    JSON."""
+    # read in clustering definition file
+    filename = 'npi_clustering_rki.txt'
+    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)
+    # read npis
+    df_npis = pd.read_csv(
+        os.path.join(directory, 'germany_counties_npi_subcat.csv'))
+    npis = pd.read_json(os.path.join(directory, 'npis.json'))
+    npi_codes_considered = [
+        x for x in npis[dd.EngEng['npiCode']] if len(x.split('_')) == 2]
+    df_merged = df_npis.iloc[:, :2]
+    for subcode in npi_codes_considered:
+        # extract columns which have the subcode as part of the column
+        # name and sum over all these subcodes
+        df_merged[subcode] = df_npis.filter(
+            regex=subcode).sum(axis=1)
+    # copy date and county data
+    df_out = df_npis.loc[:, ['Date', 'ID_County']]
+    with open(path) as file:
+        lines = file.readlines()
+    # each line corresponds to one main code
+    for line in lines:
+        name = line.split(':')[0]
+        codes = ast.literal_eval(line.split(':')[1])
+        ind = 1
+        for level in codes:
+            # skip empty levels
+            if not len(level):
+                ind += 1
+                continue
+            else:
+                # do not include clusters with level 1 (analogous to RKI)
+                if ind == 1:
+                    ind += 1
+                else:
+                    cluster_name = name + '; Level ' + str(ind)
+                    df_out[cluster_name] = df_merged.loc[:, [
+                        column for column in df_merged.columns
+                        if column in level]].sum(axis=1).values
+                    ind += 1
+    # merge maincodes as in rki
+    # Schools
+    for i in range(2, 5):  # i.e., 2, 3, 4
+        df_out['Schools; Level ' +
+               str(i)] = df_out["Primary Schools; Level "+str(i)]
+    # ps level 5 and not fes level 5
+    df_out['Schools; Level 5'] = df_out["Primary Schools; Level 5"]
+    df_out['Schools; Level 5'] *= (
+        df_out["Further Education Schools; Level 5"].replace([1, 0], [0, 1]))
+    # ps level 5 and fes level 5
+    df_out['Schools; Level 6'] = df_out["Primary Schools; Level 5"]
+    df_out['Schools; Level 6'] *= (
+        df_out["Further Education Schools; Level 5"])
+    # Public Events
+    for i in range(2, 7):  # i.e., 2, 3, 4, 5, 6
+        df_out['Public Events; Level ' +
+               str(i)] = df_out['Public Events Indoor; Level '+str(i)]
+    # Sports
+    for i in range(2, 4):  # i.e., 2, 3
+        df_out['Sports; Level ' +
+               str(i)] = df_out['Indoor Sports; Level '+str(i)]
+    # indoor level
4 and not outdoor level 5 + df_out['Sports; Level 4'] = df_out["Indoor Sports; Level 4"] + df_out['Sports; Level 4'] *= ( + df_out["Outdoor Sports; Level 5"].replace([1, 0], [0, 1])) + # indoor level 4 and outdoor level 5 + df_out['Sports; Level 5'] = df_out["Indoor Sports; Level 4"] + df_out['Sports; Level 5'] *= (df_out["Outdoor Sports; Level 5"]) + # drop old columns + for drop_name in ['Public Events Indoor', 'Public Events Outdoor', "Primary Schools", "Further Education Schools", "Indoor Sports", "Outdoor Sports"]: + for i in range(2, 7): + try: + df_out.drop(drop_name + '; Level ' + + str(i), inplace=True, axis=1) + except KeyError: + pass + df_out = merge_cluster_rki(df_out, 'SKBG') + filename = 'cluster_rki' + gd.write_dataframe(df_out, directory, filename, 'json') + return df_out + + +# def write_cluster_rki(df_npis, cluster_function, npis_corr, directory, npi_codes_used): +# cluster_dict = dict() +# # only compute cluster with more than two items +# # Read the JSON file +# with open(os.path.join(directory, 'npi_clustering_rki_manual.json')) as json_file: +# data = json.load(json_file) + +# # Convert JSON data to a DataFrame +# df_rki_clustering = pd.DataFrame.from_dict( +# data, orient='index').transpose() + +# cluster_codes = [list(item) for sublist in df_rki_clustering.values.tolist() +# for item in sublist] +# # remove empty lists from cluster_codes +# cluster_codes = [x for x in cluster_codes if x] + +# cluster_to_combine = [item for item in cluster_codes if len(item) > 1] +# name_id = 0 +# for cl in cluster_to_combine: +# # get index of cluster items to check if they are negative or positive correlated if they have been used +# cluster_idx = [np.where(npi_codes_used == cl_code) +# for cl_code in cl if np.where(npi_codes_used == cl_code)[0].size > 0] +# for id_npi in cluster_idx: +# if npis_corr[cluster_idx[0], id_npi] < 0: +# # switch npis 0 -> 1 and 1 -> 0 +# npi_to_switch = npi_codes_used[id_npi] +# df_npis[npi_to_switch] -= 1 +# df_npis[npi_to_switch] *= -1 +# # copy values and delete item from dataframe +# values = [df_npis[cl_code].values for cl_code in cl] +# df_npis.drop(cl, axis=1, inplace=True) +# # name clusters #TODO: how? For now cluster_0,1,2... 
+# cluster_name = 'cluster_' + str(name_id) +# # write cluster names in a dict (and to json) to reconstruct clusters after regression +# cluster_dict[cluster_name] = cl +# # TODO: how should the cluster values for the regression be calculated +# cluster_value = cluster_function(np.array(values), axis=0) +# df_npis[cluster_name] = cluster_value +# name_id += 1 +# # write npis with clusters +# filename = 'clustered_npis_rki' +# file_format = 'json' +# gd.write_dataframe(df_npis, directory, filename, file_format) +# # write cluster dict +# filename = 'cluster_description_rki' +# with open(directory+filename, 'w') as f: +# print(cluster_dict, file=f) + +def merge_cluster_rki(df_rki, which): + merge_list = [] + # define column names which have to be merged + if 'S' in which: + merge_list.append('Sports') + if 'K' in which: + merge_list.append('Cultural and Educational Institutions') + if 'B' in which: + merge_list.append('Accommodation') + if 'G' in which: + merge_list.append('Gastronomy') + if ('H' in which) or ('E' in which): + merge_list.append('Retail') + if 'D' in which: + merge_list.append('Services and Crafts') + column_names = [df_rki.filter(regex=name).columns.tolist() + for name in merge_list] + # put values in new df + df_merge = pd.DataFrame() + df_max_stage = df_rki.loc[:, [ + code for m in column_names for code in m if code.endswith('5')]] + i = 0 + for main_c in column_names: + df_merge[str(i)] = df_rki.loc[:, main_c].max(axis=1) + i += 1 + # delete old columns + df_rki.drop(main_c, axis=1, inplace=True) + # merge if condition (max 1,2,3...) applies + for stage in range(1, len(which)+1): + df_rki[str(stage) + ' of ' + which + + ' max'] = (df_merge.sum(axis=1) <= stage).astype(int) + df_rki[str(stage) + ' of ' + which + ' on second highest level'] = ( + df_max_stage.sum(axis=1) == stage).astype(int) + return df_rki + + +def main(): + """! 
Main program entry.""" + + fine_resolution = 2 + npis_final = [] + directory = os.path.join(dd.defaultDict['out_folder'], 'Germany/') + file_format = 'json' + df_rki = get_clustering_rki(directory, file_format) + npi_codes_considered = ['M01a_010', 'M01a_020', + 'M01a_100', 'M01a_110', 'M01a_120'] + + analyze_npi_data(read_data=True, make_plot=True, fine_resolution=fine_resolution, npis=npis_final, + directory=directory, file_format=file_format, npi_codes_considered=False, abs_correlation=True, cluster_function=np.mean, manual_rki_clustering=False) + + +if __name__ == "__main__": + + main() diff --git a/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt new file mode 100644 index 0000000000..63060d2634 --- /dev/null +++ b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki.txt @@ -0,0 +1,22 @@ +Contacts in private spaces:[ [], ['M01a_020'], ['M01a_080', 'M01a_090', 'M01a_100'], ['M01a_110', 'M01a_120'], ['M01a_130', 'M01a_140', 'M01a_150'], [],] +Contacts in public spaces:[ [], ['M01b_020'], ['M01b_080', 'M01b_090', 'M01b_100'], ['M01b_110', 'M01b_120'], ['M01b_130', 'M01b_140', 'M01b_150'], [],] +Further Education Schools:[ ['M02a_010'], ['M02a_020'], ['M02a_030', 'M02a_031', 'M02a_032'], ['M02a_033', 'M02a_034', 'M02a_036'], ['M02a_035', 'M02a_040'], [],] +Primary Schools:[ ['M02b_010'], ['M02b_020'], ['M02b_030', 'M02b_031', 'M02b_032'], ['M02b_033', 'M02b_034', 'M02b_036'], ['M02b_035', 'M02b_040'], [],] +Day Care:[ ['M03_010'],['M03_020'],['M03_030'],['M03_040','M03_050'], [], ['M03_060']] +Public Events Indoor:[[],['M04_050','M04_060'],['M04_070','M04_080'],['M04_090'],['M04_100','M04_110','M04_120','M04_130'],['M04_140'],] +Public Events Outdoor:[[],['M05_050','M05_060'],['M05_070','M05_080'],['M05_090','M05_100','M05_110','M05_120'],['M05_130','M05_140','M05_150'],['M05_160'],] +Cultural and Educational Institutions:[ [], ['M06_020'], ['M06_030', 'M06_031'], ['M06_032', 'M06_033', 'M06_034'], ['M06_040'], [],] +Retail:[ [], ['M07_020'], ['M07_030'], ['M07_033', 'M07_034', 'M07_036'], ['M07_032', 'M07_031', 'M07_040'], [],] +Gastronomy:[ [], ['M08_020'], ['M08_030', 'M08_031'], ['M08_032'], ['M08_033', 'M08_040'], [],] +Services and Crafts:[ [], ['M09_020'], ['M09_030', 'M09_031'], ['M09_032'], ['M09_033', 'M09_040'], [],] +Nightlife Establishments:[ [], ['M10_020', 'M10_030', 'M10_031', 'M10_032', 'M10_033'], ['M10_040'], [], [], [],] +Accommodation:[ [], ['M11_020'], ['M11_030', 'M11_040'], [], [], [],] +Indoor Sports:[ [], ['M12_020', 'M12_030'], ['M12_040'], ['M12_070'], [], [],] +Outdoor Sports:[ [], ['M13_020'], ['M13_030'], ['M13_040'], ['M13_070'],[],] +Domestic Travel:[ ['M14_010'], ['M14_020'], ['M14_030'],[], [], [],] +International Travel:[ ['M15_010'], ['M15_020'], ['M15_030'], [], [], [],] +Masking:[ ['M16_010', 'M16_020'], ['M16_030', 'M16_040'], ['M16_050'], ['M16_060', 'M16_070', 'M16_080', 'M16_090'], ['M16_100'], [],] +Workplace:[ ['M17_010'], ['M17_020', 'M17_030'], [], [], [], [],] +Curfew:[ [], ['M18_020', 'M18_030', 'M18_040'], [], [], [], [],] +Physical Distancing:[ ['M20_010'], ['M20_020'], [], [], [], [],] +Testing:[ ['M21_010'], ['M21_020', 'M21_030'], ['M21_040', 'M21_080'], ['M21_050','M21_060','M21_070'], [], [],] diff --git a/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki_manual.json b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki_manual.json new file mode 100644 index 0000000000..f9c299e8fb --- /dev/null +++ 
b/pycode/memilio-epidata/memilio/epidata/npi_clustering_rki_manual.json @@ -0,0 +1,327 @@ +{ + "Contacts in private spaces": [ + [], + [ + "M01a_020" + ], + [ + "M01a_080", + "M01a_090", + "M01a_100" + ], + [ + "M01a_110", + "M01a_120" + ], + [ + "M01a_130", + "M01a_140", + "M01a_150" + ], + [] + ], + "Contacts in public spaces": [ + [], + [ + "M01b_020" + ], + [ + "M01b_080", + "M01b_090", + "M01b_100" + ], + [ + "M01b_110", + "M01b_120" + ], + [ + "M01b_130", + "M01b_140", + "M01b_150" + ], + [] + ], + "Further Education Schools": [ + [], + [ + "M02a_010" + ], + [ + "M02a_020" + ], + [ + "M02a_030", + "M02a_031", + "M02a_032", + "M02a_033", + "M02a_034", + "M02a_035", + "M02a_040" + ], + [], + [] + ], + "Primary Schools": [ + [], + [ + "M02b_010" + ], + [ + "M02b_020" + ], + [ + "M02b_030", + "M02b_031", + "M02b_032", + "M02b_033", + "M02b_034", + "M02b_036", + "M02b_035", + "M02b_040" + ], + [], + [] + ], + "Cultural and Educational Institutions": [ + [], + [ + "M06_020" + ], + [ + "M06_030", + "M06_031", + "M06_032", + "M06_033", + "M06_034" + ], + [ + "M06_040" + ], + [], + [] + ], + "Retail": [ + [], + [ + "M07_020" + ], + [ + "M07_030", + "M07_031", + "M07_032", + "M07_033", + "M07_034", + "M07_036" + ], + [ + "M07_040" + ], + [], + [] + ], + "Hospitality": [ + [], + [ + "M08_020" + ], + [ + "M08_030", + "M08_031", + "M08_032", + "M08_033" + ], + [ + "M08_040" + ], + [], + [] + ], + "Services and Crafts": [ + [], + [ + "M09_020" + ], + [ + "M09_030", + "M09_031", + "M09_032", + "M09_033" + ], + [ + "M09_040" + ], + [], + [] + ], + "Nightlife Establishments": [ + [], + [ + "M10_020" + ], + [ + "M10_030", + "M10_031", + "M10_032", + "M10_033" + ], + [ + "M10_040" + ], + [], + [] + ], + "Accommodation": [ + [], + [ + "M11_020" + ], + [ + "M11_030" + ], + [ + "M11_040" + ], + [], + [] + ], + "Indoor Sports": [ + [], + [ + "M12_020" + ], + [ + "M12_030" + ], + [ + "M12_040" + ], + [ + "M12_070" + ], + [] + ], + "Outdoor Sports": [ + [], + [ + "M13_020" + ], + [ + "M13_030" + ], + [ + "M13_040" + ], + [ + "M13_070" + ], + [] + ], + "Domestic Travel": [ + [], + [ + "M14_010" + ], + [ + "M14_020" + ], + [ + "M14_030" + ], + [], + [] + ], + "International Travel": [ + [], + [ + "M15_010" + ], + [ + "M15_020" + ], + [ + "M15_030" + ], + [], + [] + ], + "Masking": [ + [ + "M16_010", + "M16_020" + ], + [ + "M16_030", + "M16_040", + "M16_050", + "M16_060" + ], + [ + "M16_070", + "M16_080", + "M16_090" + ], + [], + [], + [ + "M16_100" + ] + ], + "Workplace": [ + [], + [ + "M17_010" + ], + [ + "M17_020" + ], + [ + "M17_030" + ], + [], + [] + ], + "Curfew": [ + [], + [ + "M18_020" + ], + [ + "M18_030" + ], + [ + "M18_040" + ], + [], + [] + ], + "Physical Distancing": [ + [ + "M20_010" + ], + [ + "M20_020" + ], + [], + [], + [], + [] + ], + "Testing": [ + [ + "M21_010" + ], + [ + "M21_020" + ], + [ + "M21_030" + ], + [ + "M21_040" + ], + [ + "M21_050" + ], + [] + ] +} \ No newline at end of file