-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodel.py
161 lines (141 loc) · 4.48 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import pandas as pd
from sklearn import preprocessing
import os.path
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import plotly.plotly as py
import random
# Shared module state. All three names are placeholders until pre_process()
# rebinds them with the loaded data.
countries = pd.DataFrame()  # country names, rebound by pre_process()
data_after_pre_procsess = pd.DataFrame()  # cleaned data table, rebound by pre_process()
directory_path = ""  # folder of the data file, rebound by pre_process()
def read_file(path):
    """Read an Excel file and load its content into a DataFrame.

    :param path: path of the Excel data file
    :return: DataFrame with the file's data, without the 'year' column
    :raises ValueError: if the file holds no rows
    """
    frame = pd.read_excel(path)
    if not len(frame):
        raise ValueError('empty file')
    return frame.drop(columns=['year'])
def replace_na(df):
    """Fill missing values of every float64 column with that column's mean.

    The DataFrame is modified in place; nothing is returned.

    :param df: DataFrame with the data
    """
    float_columns = df.select_dtypes(include=['float64']).columns
    for column in float_columns:
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on df[column]: that chained form works on an
        # intermediate Series and does not update df under pandas
        # Copy-on-Write.
        df[column] = df[column].fillna(df[column].mean())
def standardization(df):
    """Standardize every float64 column (zero mean, unit variance).

    The DataFrame is modified in place; nothing is returned.

    :param df: DataFrame with the data
    """
    float_columns = df.select_dtypes(include=['float64']).columns
    for column in float_columns:
        # Store the scaled values explicitly. Discarding the return value and
        # relying on copy=False to overwrite the Series' buffer is not
        # guaranteed to touch df at all.
        df[column] = preprocessing.scale(
            df[column].to_numpy(), axis=0, with_mean=True, with_std=True
        )
def data_grouping(df):
    """Build a new DataFrame holding, per country, the mean of each column.

    :param df: DataFrame with the data (must contain a 'country' column)
    :return: DataFrame after the group-by, with 'country' kept as a column
    """
    per_country = df.groupby('country', as_index=False)
    return per_country.mean()
def clean_data(df):
    """Replace empty values, standardize the values and group by country.

    :param df: DataFrame with the data
    :return: DataFrame after cleaning the data
    """
    # The first two steps work on df in place; only grouping returns a new frame.
    replace_na(df)
    standardization(df)
    return data_grouping(df)
def pre_process(path):
    """Perform the pre-processing on the data file.

    Stores the results in the module globals ``countries``,
    ``data_after_pre_procsess`` and ``directory_path``.

    :param path: path of the data file
    """
    global directory_path
    global data_after_pre_procsess
    global countries
    grouped = clean_data(read_file(path))
    countries = grouped['country']
    # set_index moves 'country' out of the columns and makes it the row index.
    indexed = grouped.set_index('country')
    # Parent folder of the data file.
    parent = os.path.join(path, os.pardir)
    directory_path = os.path.abspath(parent)
    data_after_pre_procsess = indexed
def k_means(num_of_clusters, num_of_runs):
    """Run the k-means model on the pre-processed data.

    The cluster label of every row is stored in a 'prediction' column of the
    module-level ``data_after_pre_procsess`` DataFrame.

    :param num_of_clusters: number of clusters chosen by the user
    :param num_of_runs: number of runs chosen by the user (KMeans ``n_init``)
    :raises ValueError: if the model cannot be fitted on the data
    """
    try:
        # Drop labels left by an earlier run so they are not used as a feature.
        if 'prediction' in data_after_pre_procsess.columns:
            data_after_pre_procsess.drop(['prediction'], inplace=True, axis=1)
        k_means_model = KMeans(n_clusters=num_of_clusters, init='random', n_init=num_of_runs)
        k_means_model.fit(data_after_pre_procsess)
        data_after_pre_procsess['prediction'] = k_means_model.labels_
    except Exception as err:
        # Catch Exception rather than a bare except so KeyboardInterrupt and
        # SystemExit pass through, and chain the cause for debugging.
        raise ValueError('data not in the right format') from err
def plot_scatter():
    """Plot a scatter graph of Generosity as a function of Social support.

    Every point is coloured by its cluster label ('prediction' column). The
    image is saved as 'scatter_img.png' inside ``directory_path``.

    :return: path of the saved image
    """
    x = data_after_pre_procsess['Social support']
    y = data_after_pre_procsess['Generosity']
    labels = data_after_pre_procsess['prediction']
    # One random hex colour per cluster label.
    palette = {
        label: "#" + ''.join(random.choice('0123456789ABCDEF') for _ in range(6))
        for label in labels.unique()
    }
    # scatter() needs one colour per point, not one per cluster.
    point_colors = [palette[label] for label in labels]
    # Start from an empty figure so points drawn by an earlier call are not
    # drawn again.
    plt.clf()
    plt.scatter(x, y, c=point_colors, alpha=1)
    plt.title('K Means Clustering')
    plt.xlabel('Social support')
    plt.ylabel('Generosity')
    path_to_img = directory_path + '/scatter_img.png'
    plt.savefig(path_to_img)
    return path_to_img
def plot_map():
    """Plot a choropleth map coloured by each country's cluster label.

    The image is saved as 'map_img.png' inside ``directory_path``.

    The Plotly account can be supplied through the PLOTLY_USERNAME and
    PLOTLY_API_KEY environment variables; when they are not set, the values
    previously hard-coded in this function are used.

    :return: path of the saved image
    """
    import os

    # Define the data to be visualised and some of the parameters of the visualisation
    data = [dict(
        type='choropleth',
        locations=countries,
        locationmode='country names',
        z=data_after_pre_procsess['prediction'].astype(str),
        text=countries.astype(str),
        colorscale='Rainbow',
        autocolorscale=False,
        reversescale=True,
        marker=dict(
            line=dict(
                color='rgb(180,180,180)',
                width=0.5,
            ),
        ),
        colorbar=dict(
            autotick=False,
            title='prediction',
        ),
    )]
    layout = dict(
        title='K Means Clustering',
        geo=dict(
            showframe=True,
            showcoastlines=True,
            projection=dict(
                type='Mercator',
            ),
        ),
    )
    # Plot
    fig = dict(data=data, layout=layout)
    # SECURITY NOTE(review): the fallback credentials below are committed to
    # source control. They are kept only so existing setups keep working;
    # the key should be rotated and supplied through the environment.
    username = os.environ.get('PLOTLY_USERNAME', 'amitmag')
    api_key = os.environ.get('PLOTLY_API_KEY', 'UzC0vg747jN3LLGYMgJc')
    py.sign_in(username=username, api_key=api_key)
    path_to_img = directory_path + '/map_img.png'
    py.image.save_as(fig, filename=path_to_img)
    return path_to_img