-
Notifications
You must be signed in to change notification settings - Fork 0
/
Stage 1 - Lesson 5.py
160 lines (83 loc) · 3.13 KB
/
Stage 1 - Lesson 5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/usr/bin/env python
# coding: utf-8
# In[1]:
#conversion to csv
import pandas as pd
fuel_data = pd.read_csv('https://raw.githubusercontent.com/WalePhenomenon/climate_change/master/fuel_ferc1.csv')
fuel_data.to_csv('fuel_ferc1.csv', index=False)
# In[2]:
#reading data and description
import pandas as pd
fuel_data = pd.read_csv('fuel_ferc1.csv')
fuel_data.describe(include='all')
# In[3]:
#check for missing values
fuel_data.isnull().sum()
# In[4]:
#use groupby to count the sum of each unique value in the fuel unit column
fuel_data.groupby('fuel_unit')['fuel_unit'].count()
fuel_data[['fuel_unit']] = fuel_data[['fuel_unit']].fillna(value='mcf')
# In[5]:
#check if missing values have been filled
fuel_data.isnull().sum()
fuel_data.groupby('report_year')['report_year'].count()
# In[6]:
#group by the fuel type code year and print the first entries in all the groups formed
fuel_data.groupby('fuel_type_code_pudl').first()
# In[9]:
fuel_df1 = fuel_data.iloc[0:1900].reset_index(drop=True)
fuel_df2 = fuel_data.iloc[1900:].reset_index(drop=True)
# In[11]:
#check that the length of both dataframes sum to the expected length
assert len(fuel_data) == (len(fuel_df1) + len(fuel_df2))
# In[12]:
#outer merge returns all rows in both dataframes
pd.merge(fuel_df1, fuel_df2, how="inner")
# In[13]:
#outer merge returns all rows in both dataframes
pd.merge(fuel_df1, fuel_df2, how="outer")
# In[14]:
#removes rows from the right dataframe that don not have a match with the left
#and keeps all rows from the left
pd.merge(fuel_df1, fuel_df2, how="left")
# In[15]:
#check for duplicate rows
fuel_data.duplicated().any()
# In[17]:
#Import plotting library
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(7,4))
plt.xticks(rotation=90)
fuel_unit = pd.DataFrame({'unit':['BBL', 'GAL', 'GRAMSU', 'KGU', 'MCF', 'MMBTU', 'MWDTH', 'MWHTH', 'TON'],
'count':[7998, 84, 464, 110, 11354, 180, 95, 100, 8958]
})
sns.barplot(data=fuel_unit, x='unit', y='count')
plt.xlabel('Fuel Unit'),
#Because of the extreme range of the values for the fuel unit, we can plot the barchart by taking the logarithm of the y-axis as follows:
g = sns.barplot(data=fuel_unit, x='unit', y='count')
g.set_yscale("log")
g.set_ylim(1, 12000)
plt.xlabel('Fuel Unit'),
# In[34]:
import pandas as pd
# Import plotting library
import seaborn as sns
fuel_data = pd.read_csv('fuel_ferc1.csv')
# Box plot
sns.boxplot(x="fuel_type_code_pudl", y="utility_id_ferc1",
palette=["m", "g"], data=fuel_data)
# In[7]:
# Select a sample of the dataset
sample_df = fuel_data.sample(n=50, random_state=4)
sns.regplot(x=sample_df["utility_id_ferc1"], y=sample_df["fuel_cost_per_mmbtu"], fit_reg=False)
# In[5]:
#KDE Plotting
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
fuel_data = pd.read_csv('fuel_ferc1.csv')
sample_df = pd.DataFrame(fuel_data.sample(n=50, random_state=7), columns=['fuel_cost_per_unit_burned'])
sns.kdeplot(sample_df['fuel_cost_per_unit_burned'], shade=True, color="b")
plt.xlabel('sample_df')
# In[ ]: