Skip to content

Commit

Permalink
refactoring for data
Browse files Browse the repository at this point in the history
  • Loading branch information
Remi Tschupp committed Jun 25, 2024
1 parent 9da0ce1 commit f4856c5
Showing 1 changed file with 23 additions and 16 deletions.
39 changes: 23 additions & 16 deletions titanic/titanic_data_handling.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,21 @@
"## EDA"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Only the labelled training file is used to train our model. The other, unlabelled file can be used for inference tests to demonstrate the model's capabilities."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"train = pd.read_csv(os.path.join(gen_dirname,r\"data\\train.csv\"))\n",
"test = pd.read_csv(os.path.join(gen_dirname,r\"data\\test.csv\"))"
"labelled_train = pd.read_csv(os.path.join(gen_dirname,r\"data\\labelled_train.csv\"))\n",
"unlabelled_test = pd.read_csv(os.path.join(gen_dirname,r\"data\\unlabelled_test.csv\"))"
]
},
{
Expand Down Expand Up @@ -185,7 +192,7 @@
}
],
"source": [
"train.head()"
"labelled_train.head()"
]
},
{
Expand Down Expand Up @@ -238,8 +245,8 @@
}
],
"source": [
"train.info()\n",
"test.info()"
"labelled_train.info()\n",
"unlabelled_test.info()"
]
},
{
Expand Down Expand Up @@ -279,8 +286,8 @@
"metadata": {},
"outputs": [],
"source": [
"train_rough = train.drop(features_to_drop_rough,axis=\"columns\")\n",
"test_rough = test.drop(features_to_drop_rough,axis=\"columns\")"
"train_rough = labelled_train.drop(features_to_drop_rough,axis=\"columns\")\n",
"test_rough = unlabelled_test.drop(features_to_drop_rough,axis=\"columns\")"
]
},
{
Expand Down Expand Up @@ -340,7 +347,7 @@
}
],
"source": [
"test_rough.loc[test[\"Fare\"].isnull()]"
"test_rough.loc[unlabelled_test[\"Fare\"].isnull()]"
]
},
{
Expand All @@ -356,7 +363,7 @@
"metadata": {},
"outputs": [],
"source": [
"test_rough[\"Fare\"] = test_rough[\"Fare\"].fillna(test[\"Fare\"].mean())"
"test_rough[\"Fare\"] = test_rough[\"Fare\"].fillna(unlabelled_test[\"Fare\"].mean())"
]
},
{
Expand All @@ -372,8 +379,8 @@
"metadata": {},
"outputs": [],
"source": [
"train_gentle = train.drop(features_to_drop_gentle,axis=\"columns\")\n",
"test_gentle = test.drop(features_to_drop_gentle,axis=\"columns\")"
"train_gentle = labelled_train.drop(features_to_drop_gentle,axis=\"columns\")\n",
"test_gentle = unlabelled_test.drop(features_to_drop_gentle,axis=\"columns\")"
]
},
{
Expand All @@ -393,7 +400,7 @@
"metadata": {},
"outputs": [],
"source": [
"test_gentle[\"Fare\"] = test_gentle[\"Fare\"].fillna(test[\"Fare\"].mean())"
"test_gentle[\"Fare\"] = test_gentle[\"Fare\"].fillna(unlabelled_test[\"Fare\"].mean())"
]
},
{
Expand Down Expand Up @@ -443,8 +450,8 @@
"source": [
"save_folder = os.path.join(gen_dirname,r\"data\\rough\")\n",
"os.makedirs(save_folder,exist_ok = True)\n",
"train_rough.to_csv(os.path.join(save_folder,\"train.csv\"),index=False)\n",
"test_rough.to_csv(os.path.join(save_folder,\"test.csv\"),index=False)"
"train_rough.to_csv(os.path.join(save_folder,\"labelled.csv\"),index=False)\n",
"test_rough.to_csv(os.path.join(save_folder,\"unlabelled.csv\"),index=False)"
]
},
{
Expand All @@ -455,8 +462,8 @@
"source": [
"save_folder = os.path.join(gen_dirname,r\"data\\gentle\")\n",
"os.makedirs(save_folder,exist_ok = True)\n",
"train_gentle.to_csv(os.path.join(save_folder,\"train.csv\"),index=False)\n",
"test_gentle.to_csv(os.path.join(save_folder,\"test.csv\"),index=False)"
"train_gentle.to_csv(os.path.join(save_folder,\"labelled.csv\"),index=False)\n",
"test_gentle.to_csv(os.path.join(save_folder,\"unlabelled.csv\"),index=False)"
]
}
],
Expand Down

0 comments on commit f4856c5

Please sign in to comment.