-
-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy pathtemp.py
48 lines (45 loc) · 2.1 KB
/
temp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import pandas as pd
def read_data(option='daily'):
    '''Read the daily city-temperature dataset, optionally resampled.

    The data is loaded from ``city_temperature.csv`` in the current working
    directory, see
    https://www.kaggle.com/sudalairajkumar/daily-temperature-of-major-cities

    Args:
        option: one of 'daily', 'biweekly' or 'monthly'. When empty or
            None the user is asked interactively to pick one of them.

    Returns:
        df: for 'daily', a dataframe indexed by region, country, city and
            date with a single column temperature (degrees Celsius).
            For 'monthly' and 'biweekly', the mean temperature per city
            and period, indexed by city and date only.

    Raises:
        ValueError: if option is non-empty but not a known frequency (also
            raised by int() when the interactive answer is not a number).
        IndexError: if the interactive answer is out of range.
    '''
    options = ['monthly', 'biweekly', 'daily']
    # Fail fast on a bad argument, before the (large) CSV file is parsed.
    if option and option not in options:
        raise ValueError(
            "option must be one of {} or empty, got {!r}".format(
                options, option))

    index_names = ['region', 'country', 'city', 'date']
    df = (
        pd.read_csv('city_temperature.csv', na_values=[-99],
                    low_memory=False)
        .rename(str.lower, axis='columns')
        # day 0 and years 200, 201 seem to be errors in the data
        .loc[lambda x: (x.day != 0) & (x.year != 200) & (x.year != 201)]
        .drop(['state'], axis=1)
        .assign(region=lambda x: x.region.astype('category'),
                country=lambda x: x.country.astype('category'),
                city=lambda x: x.city.astype('category'))
        # Missing readings (-99 in the file) take the previous row's value.
        # NOTE: the fill runs over the whole file, so a gap at the start of
        # one city inherits the last reading of the city before it.
        .assign(avgtemperature=lambda x: x.avgtemperature.ffill())
        # Fahrenheit to Celsius
        .assign(temperature=lambda x: (x.avgtemperature - 32) * 5 / 9)
        .drop(['avgtemperature'], axis=1)
        # Impossible calendar dates become NaT and are dropped below.
        .assign(date=lambda x: pd.to_datetime(x[['year', 'month', 'day']],
                                              errors='coerce'))
        .drop(['year', 'month', 'day'], axis=1)
        .drop_duplicates(subset=index_names)  # NOTE: ideally take mean
        .dropna()
        .set_index(index_names)
        .sort_index(level=index_names)
    )

    if option:
        choice = option
    else:
        # No frequency given: let the user pick one by its list position.
        print("Please choose:")
        for idx, element in enumerate(options):
            print("{}) {}".format(idx, element))
        choice = options[int(input("Enter number: "))]

    if choice == 'monthly':
        # An offset object instead of a string alias: the month-end alias
        # was renamed between pandas versions, the object is stable.
        freq = pd.offsets.MonthEnd()
    elif choice == 'biweekly':
        freq = '2W'
    else:
        return df  # 'daily': the data is already at daily resolution

    return df.groupby(['city', pd.Grouper(level='date', freq=freq)]).mean()