Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fixing url issues and updating column names #38

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
278 changes: 154 additions & 124 deletions modern_1_intro.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -43,42 +43,38 @@
"import requests\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"if int(os.environ.get(\"MODERN_PANDAS_EPUB\", 0)):\n",
" import prep"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/jj/anaconda3/envs/air38/lib/python3.9/site-packages/urllib3/connectionpool.py:1045: InsecureRequestWarning: Unverified HTTPS request is being made to host 'transtats.bts.gov'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n",
" warnings.warn(\n"
]
}
],
"source": [
"import requests\n",
"\n",
"headers = {\n",
" 'Referer': 'https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time',\n",
" 'Origin': 'https://www.transtats.bts.gov',\n",
" 'Content-Type': 'application/x-www-form-urlencoded',\n",
"}\n",
"\n",
"params = (\n",
" ('Table_ID', '236'),\n",
" ('Has_Group', '3'),\n",
" ('Is_Zipped', '0'),\n",
")\n",
"\n",
"with open('modern-1-url.txt', encoding='utf-8') as f:\n",
" data = f.read().strip()\n",
"\n",
"os.makedirs('data', exist_ok=True)\n",
"dest = \"data/flights.csv.zip\"\n",
"\n",
"if not os.path.exists(dest):\n",
" r = requests.post('https://www.transtats.bts.gov/DownLoad_Table.asp',\n",
" headers=headers, params=params, data=data, stream=True)\n",
" r = requests.get('https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2022_1.zip', \n",
" verify=False,\n",
" stream=True)\n",
"\n",
"\n",
" with open(\"data/flights.csv.zip\", 'wb') as f:\n",
" for chunk in r.iter_content(chunk_size=102400): \n",
Expand All @@ -97,59 +93,87 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"name": "stderr",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 450017 entries, 0 to 450016\n",
"Data columns (total 33 columns):\n",
"fl_date 450017 non-null datetime64[ns]\n",
"unique_carrier 450017 non-null object\n",
"airline_id 450017 non-null int64\n",
"tail_num 449378 non-null object\n",
"fl_num 450017 non-null int64\n",
"origin_airport_id 450017 non-null int64\n",
"origin_airport_seq_id 450017 non-null int64\n",
"origin_city_market_id 450017 non-null int64\n",
"origin 450017 non-null object\n",
"origin_city_name 450017 non-null object\n",
"dest_airport_id 450017 non-null int64\n",
"dest_airport_seq_id 450017 non-null int64\n",
"dest_city_market_id 450017 non-null int64\n",
"dest 450017 non-null object\n",
"dest_city_name 450017 non-null object\n",
"crs_dep_time 450017 non-null int64\n",
"dep_time 441476 non-null float64\n",
"dep_delay 441476 non-null float64\n",
"taxi_out 441244 non-null float64\n",
"wheels_off 441244 non-null float64\n",
"wheels_on 440746 non-null float64\n",
"taxi_in 440746 non-null float64\n",
"crs_arr_time 450017 non-null int64\n",
"arr_time 440746 non-null float64\n",
"arr_delay 439645 non-null float64\n",
"cancelled 450017 non-null float64\n",
"cancellation_code 8886 non-null object\n",
"carrier_delay 97699 non-null float64\n",
"weather_delay 97699 non-null float64\n",
"nas_delay 97699 non-null float64\n",
"security_delay 97699 non-null float64\n",
"late_aircraft_delay 97699 non-null float64\n",
"unnamed: 32 0 non-null float64\n",
"dtypes: datetime64[ns](1), float64(15), int64(10), object(7)\n",
"memory usage: 113.3+ MB\n"
"/tmp/ipykernel_398669/1703248179.py:3: DtypeWarning: Columns (76,77,84) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" df = pd.read_csv(fp, parse_dates=[\"FlightDate\"]).rename(columns=str.lower)\n"
]
}
],
"source": [
"zf = zipfile.ZipFile(\"data/flights.csv.zip\")\n",
"fp = zf.extract(zf.filelist[0].filename, path='data/')\n",
"df = pd.read_csv(fp, parse_dates=[\"FL_DATE\"]).rename(columns=str.lower)\n",
"df = pd.read_csv(fp, parse_dates=[\"FlightDate\"]).rename(columns=str.lower)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Map the lower-cased column names of the current BTS export onto the\n",
"# names used in the book. Columns that are not listed keep their name;\n",
"# 'origin', 'dest' and 'cancelled' already match after lower-casing.\n",
"cols = {\n",
"    'flightdate': 'fl_date',\n",
"    # BTS documents Reporting_Airline as the unique carrier code.\n",
"    'reporting_airline': 'unique_carrier',\n",
"    # airline_id is the numeric DOT identifier (int64 in the book's\n",
"    # df.info()), so it must come from the DOT id column, not the code.\n",
"    'dot_id_reporting_airline': 'airline_id',\n",
"    # iata_code_reporting_airline is deliberately left unmapped: the\n",
"    # book's column list has no counterpart for it.\n",
"    'tail_number': 'tail_num',\n",
"    'flight_number_reporting_airline': 'fl_num',\n",
"    'originairportid': 'origin_airport_id',\n",
"    'originairportseqid': 'origin_airport_seq_id',\n",
"    'origincitymarketid': 'origin_city_market_id',\n",
"    'origincityname': 'origin_city_name',\n",
"    'destairportid': 'dest_airport_id',\n",
"    'destairportseqid': 'dest_airport_seq_id',\n",
"    'destcitymarketid': 'dest_city_market_id',\n",
"    'destcityname': 'dest_city_name',\n",
"    'crsdeptime': 'crs_dep_time',\n",
"    'deptime': 'dep_time',\n",
"    'depdelay': 'dep_delay',\n",
"    'taxiout': 'taxi_out',\n",
"    'wheelsoff': 'wheels_off',\n",
"    'wheelson': 'wheels_on',\n",
"    'taxiin': 'taxi_in',\n",
"    'crsarrtime': 'crs_arr_time',\n",
"    'arrtime': 'arr_time',\n",
"    'arrdelay': 'arr_delay',\n",
"    'cancellationcode': 'cancellation_code',\n",
"    'carrierdelay': 'carrier_delay',\n",
"    'weatherdelay': 'weather_delay',\n",
"    'nasdelay': 'nas_delay',\n",
"    'securitydelay': 'security_delay',\n",
"    'lateaircraftdelay': 'late_aircraft_delay',\n",
"}\n",
"\n",
"# rename() returns a new frame and ignores keys that are absent, so\n",
"# re-running this cell after the columns were renamed is harmless.\n",
"df = df.rename(columns=cols)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 537902 entries, 0 to 537901\n",
"Columns: 110 entries, year to unnamed: 109\n",
"dtypes: datetime64[ns](1), float64(70), int64(21), object(18)\n",
"memory usage: 451.4+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
Expand Down Expand Up @@ -877,25 +901,38 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"0\" class=\"dataframe\">\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th>airline_id</th>\n",
" <th>fl_num</th>\n",
" <th>origin_airport_id</th>\n",
" <th>origin_airport_seq_id</th>\n",
" <th>year</th>\n",
" <th>quarter</th>\n",
" <th>month</th>\n",
" <th>dayofmonth</th>\n",
" </tr>\n",
" <tr>\n",
" <th>unique_carrier</th>\n",
Expand All @@ -911,77 +948,70 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th rowspan=\"5\" valign=\"top\">AA</th>\n",
" <th rowspan=\"5\" valign=\"top\">ABQ</th>\n",
" <th rowspan=\"5\" valign=\"top\">DFW</th>\n",
" <th rowspan=\"2\" valign=\"top\">N200AA</th>\n",
" <th>2014-01-06</th>\n",
" <td>19805</td>\n",
" <td>1662</td>\n",
" <td>10140</td>\n",
" <td>1014002</td>\n",
" <th rowspan=\"5\" valign=\"top\">9E</th>\n",
" <th rowspan=\"5\" valign=\"top\">ABE</th>\n",
" <th rowspan=\"5\" valign=\"top\">ATL</th>\n",
" <th rowspan=\"2\" valign=\"top\">N138EV</th>\n",
" <th>2022-01-21</th>\n",
" <td>2022</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2014-01-27</th>\n",
" <td>19805</td>\n",
" <td>1090</td>\n",
" <td>10140</td>\n",
" <td>1014002</td>\n",
" <th>2022-01-25</th>\n",
" <td>2022</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>N202AA</th>\n",
" <th>2014-01-27</th>\n",
" <td>19805</td>\n",
" <td>1332</td>\n",
" <td>10140</td>\n",
" <td>1014002</td>\n",
" <th>N147PQ</th>\n",
" <th>2022-01-03</th>\n",
" <td>2022</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th rowspan=\"2\" valign=\"top\">N426AA</th>\n",
" <th>2014-01-09</th>\n",
" <td>19805</td>\n",
" <td>1662</td>\n",
" <td>10140</td>\n",
" <td>1014002</td>\n",
" <th>N181GJ</th>\n",
" <th>2022-01-09</th>\n",
" <td>2022</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2014-01-15</th>\n",
" <td>19805</td>\n",
" <td>1467</td>\n",
" <td>10140</td>\n",
" <td>1014002</td>\n",
" <th>N195PQ</th>\n",
" <th>2022-01-24</th>\n",
" <td>2022</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>24</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" airline_id fl_num \\\n",
"unique_carrier origin dest tail_num fl_date \n",
"AA ABQ DFW N200AA 2014-01-06 19805 1662 \n",
" 2014-01-27 19805 1090 \n",
" N202AA 2014-01-27 19805 1332 \n",
" N426AA 2014-01-09 19805 1662 \n",
" 2014-01-15 19805 1467 \n",
"\n",
" origin_airport_id \\\n",
"unique_carrier origin dest tail_num fl_date \n",
"AA ABQ DFW N200AA 2014-01-06 10140 \n",
" 2014-01-27 10140 \n",
" N202AA 2014-01-27 10140 \n",
" N426AA 2014-01-09 10140 \n",
" 2014-01-15 10140 \n",
"\n",
" origin_airport_seq_id \n",
" year quarter month \\\n",
"unique_carrier origin dest tail_num fl_date \n",
"AA ABQ DFW N200AA 2014-01-06 1014002 \n",
" 2014-01-27 1014002 \n",
" N202AA 2014-01-27 1014002 \n",
" N426AA 2014-01-09 1014002 \n",
" 2014-01-15 1014002 "
"9E ABE ATL N138EV 2022-01-21 2022 1 1 \n",
" 2022-01-25 2022 1 1 \n",
" N147PQ 2022-01-03 2022 1 1 \n",
" N181GJ 2022-01-09 2022 1 1 \n",
" N195PQ 2022-01-24 2022 1 1 \n",
"\n",
" dayofmonth \n",
"unique_carrier origin dest tail_num fl_date \n",
"9E ABE ATL N138EV 2022-01-21 21 \n",
" 2022-01-25 25 \n",
" N147PQ 2022-01-03 3 \n",
" N181GJ 2022-01-09 9 \n",
" N195PQ 2022-01-24 24 "
]
},
"execution_count": 14,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -1659,7 +1689,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -1673,7 +1703,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
"version": "3.9.13"
}
},
"nbformat": 4,
Expand Down