diff --git a/modern_1_intro.ipynb b/modern_1_intro.ipynb index 5b104f4..1fefa2e 100644 --- a/modern_1_intro.ipynb +++ b/modern_1_intro.ipynb @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -43,8 +43,7 @@ "import requests\n", "import numpy as np\n", "import pandas as pd\n", - "import seaborn as sns\n", - "import matplotlib.pyplot as plt\n", + "\n", "\n", "if int(os.environ.get(\"MODERN_PANDAS_EPUB\", 0)):\n", " import prep" @@ -52,33 +51,30 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jj/anaconda3/envs/air38/lib/python3.9/site-packages/urllib3/connectionpool.py:1045: InsecureRequestWarning: Unverified HTTPS request is being made to host 'transtats.bts.gov'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/1.26.x/advanced-usage.html#ssl-warnings\n", + " warnings.warn(\n" + ] + } + ], "source": [ "import requests\n", "\n", - "headers = {\n", - " 'Referer': 'https://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time',\n", - " 'Origin': 'https://www.transtats.bts.gov',\n", - " 'Content-Type': 'application/x-www-form-urlencoded',\n", - "}\n", - "\n", - "params = (\n", - " ('Table_ID', '236'),\n", - " ('Has_Group', '3'),\n", - " ('Is_Zipped', '0'),\n", - ")\n", - "\n", - "with open('modern-1-url.txt', encoding='utf-8') as f:\n", - " data = f.read().strip()\n", "\n", "os.makedirs('data', exist_ok=True)\n", "dest = \"data/flights.csv.zip\"\n", "\n", "if not os.path.exists(dest):\n", - " r = requests.post('https://www.transtats.bts.gov/DownLoad_Table.asp',\n", - " headers=headers, params=params, data=data, stream=True)\n", + " r = requests.get('https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2022_1.zip', \n", + " verify=False,\n", + " stream=True)\n", + "\n", "\n", " with open(\"data/flights.csv.zip\", 'wb') as f:\n", " for chunk in r.iter_content(chunk_size=102400): \n", @@ -97,59 +93,87 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "\n", - "RangeIndex: 450017 entries, 0 to 450016\n", - "Data columns (total 33 columns):\n", - "fl_date 450017 non-null datetime64[ns]\n", - "unique_carrier 450017 non-null object\n", - "airline_id 450017 non-null int64\n", - "tail_num 449378 non-null object\n", - "fl_num 450017 non-null int64\n", - "origin_airport_id 450017 non-null int64\n", - "origin_airport_seq_id 450017 non-null int64\n", - "origin_city_market_id 450017 non-null int64\n", - "origin 450017 non-null object\n", - "origin_city_name 450017 non-null object\n", - "dest_airport_id 450017 non-null int64\n", - "dest_airport_seq_id 450017 non-null int64\n", - "dest_city_market_id 450017 non-null int64\n", - "dest 450017 non-null object\n", - "dest_city_name 450017 non-null object\n", - "crs_dep_time 450017 non-null int64\n", - "dep_time 441476 non-null float64\n", - "dep_delay 441476 non-null float64\n", - "taxi_out 441244 non-null float64\n", - "wheels_off 441244 non-null float64\n", - "wheels_on 440746 non-null float64\n", - "taxi_in 440746 non-null float64\n", - "crs_arr_time 450017 non-null int64\n", - "arr_time 440746 non-null float64\n", - "arr_delay 439645 non-null float64\n", - "cancelled 450017 non-null float64\n", - "cancellation_code 8886 non-null object\n", - "carrier_delay 97699 non-null float64\n", - "weather_delay 97699 non-null float64\n", - "nas_delay 97699 non-null float64\n", - "security_delay 97699 non-null float64\n", - "late_aircraft_delay 97699 non-null float64\n", - "unnamed: 32 0 non-null float64\n", - "dtypes: datetime64[ns](1), float64(15), int64(10), object(7)\n", - "memory usage: 113.3+ MB\n" + "/tmp/ipykernel_398669/1703248179.py:3: DtypeWarning: Columns (76,77,84) have mixed types. Specify dtype option on import or set low_memory=False.\n", + " df = pd.read_csv(fp, parse_dates=[\"FlightDate\"]).rename(columns=str.lower)\n" ] } ], "source": [ "zf = zipfile.ZipFile(\"data/flights.csv.zip\")\n", "fp = zf.extract(zf.filelist[0].filename, path='data/')\n", - "df = pd.read_csv(fp, parse_dates=[\"FL_DATE\"]).rename(columns=str.lower)\n", + "df = pd.read_csv(fp, parse_dates=[\"FlightDate\"]).rename(columns=str.lower)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "#converting the names from the current version to the one used in the book\n", + "cols = {\n", + " 'flightdate': 'fl_date', \n", + " 'iata_code_reporting_airline': 'unique_carrier',\n", + " 'reporting_airline': 'airline_id',\n", + " 'tail_number': 'tail_num',\n", + " 'flight_number_reporting_airline': 'fl_num',\n", + " 'originairportid': 'origin_airport_id',\n", + " 'originairportseqid': 'origin_airport_seq_id',\n", + " 'origincitymarketid': 'origin_city_market_id',\n", + " 'origincityname': 'origin_city_name',\n", + " 'destairportid': 'dest_airport_id',\n", + " 'destairportseqid': 'dest_airport_seq_id',\n", + " 'destcitymarketid': 'dest_city_market_id',\n", + " 'dest': 'dest',\n", + " 'destcityname': 'dest_city_name',\n", + " 'crsdeptime': 'crs_dep_time',\n", + " 'deptime': 'dep_time',\n", + " 'depdelay': 'dep_delay',\n", + " 'taxiout': 'taxi_out',\n", + " 'wheelsoff': 'wheels_off',\n", + " 'wheelson': 'wheels_on',\n", + " 'taxiin': 'taxi_in',\n", + " 'crsarrtime': 'crs_arr_time',\n", + " 'arrtime': 'arr_time',\n", + " 'arrdelay': 'arr_delay',\n", + " 'cancelled': 'cancelled',\n", + " 'cancellationcode': 'cancellation_code',\n", + " 'carrierdelay': 'carrier_delay',\n", + " 'weatherdelay': 'weather_delay',\n", + " 'nasdelay': 'nas_delay',\n", + " 'securitydelay': 'security_delay',\n", + " 'lateaircraftdelay': 'late_aircraft_delay'\n", + "}\n", + " \n", "\n", + "df.rename(columns=cols, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 537902 entries, 0 to 537901\n", + "Columns: 110 entries, year to unnamed: 109\n", + "dtypes: datetime64[ns](1), float64(70), int64(21), object(18)\n", + "memory usage: 451.4+ MB\n" + ] + } + ], + "source": [ "df.info()" ] }, @@ -877,14 +901,27 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", - "\n", + "\n", + "
\n", " \n", " \n", " \n", @@ -892,10 +929,10 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -911,77 +948,70 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
airline_idfl_numorigin_airport_idorigin_airport_seq_idyearquartermonthdayofmonth
unique_carrier
AAABQDFWN200AA2014-01-061980516621014010140029EABEATLN138EV2022-01-2120221121
2014-01-271980510901014010140022022-01-2520221125
N202AA2014-01-27198051332101401014002N147PQ2022-01-032022113
N426AA2014-01-09198051662101401014002N181GJ2022-01-092022119
2014-01-15198051467101401014002N195PQ2022-01-2420221124
\n", "
" ], "text/plain": [ - " airline_id fl_num \\\n", - "unique_carrier origin dest tail_num fl_date \n", - "AA ABQ DFW N200AA 2014-01-06 19805 1662 \n", - " 2014-01-27 19805 1090 \n", - " N202AA 2014-01-27 19805 1332 \n", - " N426AA 2014-01-09 19805 1662 \n", - " 2014-01-15 19805 1467 \n", - "\n", - " origin_airport_id \\\n", - "unique_carrier origin dest tail_num fl_date \n", - "AA ABQ DFW N200AA 2014-01-06 10140 \n", - " 2014-01-27 10140 \n", - " N202AA 2014-01-27 10140 \n", - " N426AA 2014-01-09 10140 \n", - " 2014-01-15 10140 \n", - "\n", - " origin_airport_seq_id \n", + " year quarter month \\\n", "unique_carrier origin dest tail_num fl_date \n", - "AA ABQ DFW N200AA 2014-01-06 1014002 \n", - " 2014-01-27 1014002 \n", - " N202AA 2014-01-27 1014002 \n", - " N426AA 2014-01-09 1014002 \n", - " 2014-01-15 1014002 " + "9E ABE ATL N138EV 2022-01-21 2022 1 1 \n", + " 2022-01-25 2022 1 1 \n", + " N147PQ 2022-01-03 2022 1 1 \n", + " N181GJ 2022-01-09 2022 1 1 \n", + " N195PQ 2022-01-24 2022 1 1 \n", + "\n", + " dayofmonth \n", + "unique_carrier origin dest tail_num fl_date \n", + "9E ABE ATL N138EV 2022-01-21 21 \n", + " 2022-01-25 25 \n", + " N147PQ 2022-01-03 3 \n", + " N181GJ 2022-01-09 9 \n", + " N195PQ 2022-01-24 24 " ] }, - "execution_count": 14, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -1659,7 +1689,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1673,7 +1703,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.1" + "version": "3.9.13" } }, "nbformat": 4,