From 3c47c762430926fdccdadae43bffc091929f0b3a Mon Sep 17 00:00:00 2001 From: Felipe Olmos <92923444+folmos-at-orange@users.noreply.github.com> Date: Mon, 9 Dec 2024 15:13:51 +0100 Subject: [PATCH] Update samples for dataset samples 10.2.4 --- .github/workflows/unit-tests.yml | 6 +- doc/multi_table_primer.rst | 15 ++- doc/samples/samples_sklearn.rst | 132 ++++++++------------------- khiops/samples/samples_sklearn.ipynb | 132 ++++++++------------------- khiops/samples/samples_sklearn.py | 122 +++++++++---------------- khiops/tools.py | 2 +- tests/test_estimator_attributes.py | 37 +++----- 7 files changed, 145 insertions(+), 301 deletions(-) diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index f49f177d..2f726fd0 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -1,14 +1,14 @@ --- name: Unit Tests env: - DEFAULT_SAMPLES_REVISION: 10.2.0 + DEFAULT_SAMPLES_REVISION: 10.2.4 DEFAULT_KHIOPS_DESKTOP_REVISION: 10.2.3 on: workflow_dispatch: inputs: samples-revision: - default: 10.2.0 - description: Git tag, branch or commit for the khiops-samples repository + default: 10.2.4 + description: Git Tag/Branch/Commit for the khiops-samples Repo image-tag: default: latest description: Development Docker Image Tag diff --git a/doc/multi_table_primer.rst b/doc/multi_table_primer.rst index 21cc0ab8..df05df4f 100644 --- a/doc/multi_table_primer.rst +++ b/doc/multi_table_primer.rst @@ -135,8 +135,8 @@ schema: We build the input ``X`` as follows:: - accidents_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Accidents.txt", sep="\t", encoding="latin1") - vehicles_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Vehicles.txt", sep="\t", encoding="latin1") + accidents_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Accidents.txt", sep="\t") + vehicles_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Vehicles.txt", sep="\t") X = { "main_table" : "Accident", "tables": { @@ -164,11 +164,10 @@ through the following *snowflake* schema We build the input ``X`` as follows:: - # We use `Accidents.txt` table of `AccidentsSummary` as it contains the `Gravity` label pre-calculated - accidents_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Accidents.txt", sep="\t", encoding="latin1") - vehicles_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Vehicles.txt", sep="\t", encoding="latin1") - users_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Users.txt", sep="\t", encoding="latin1") - places_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Places.txt", sep="\t", encoding="latin1") + accidents_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Accidents.txt", sep="\t") + vehicles_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Vehicles.txt", sep="\t") + users_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Users.txt", sep="\t") + places_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Places.txt", sep="\t") X = { "main_table": "Accidents", @@ -176,7 +175,7 @@ We build the input ``X`` as follows:: "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), "Users": (users_df, ["AccidentId", "VehicleId"]), - "Places": (places_df, ["AccidentId"]), + "Places": (places_df, "AccidentId"), }, "relations": [ diff --git a/doc/samples/samples_sklearn.rst b/doc/samples/samples_sklearn.rst index 0ca0320b..f5ce17b8 100644 --- a/doc/samples/samples_sklearn.rst +++ b/doc/samples/samples_sklearn.rst @@ -164,7 +164,6 @@ Samples accidents_df = pd.read_csv( os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", - encoding="latin1", ) vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") @@ -218,33 +217,19 @@ Samples # Load the dataset tables into dataframes accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") - accidents_df = pd.read_csv( - os.path.join(accidents_data_dir, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - users_df = pd.read_csv( - os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" - ) - vehicles_df = pd.read_csv( - os.path.join(accidents_data_dir, "Vehicles.txt"), - sep="\t", - encoding="latin1", - ) - places_df = pd.read_csv( - os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" - ) + accidents_df = pd.read_csv(os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t") + users_df = pd.read_csv(os.path.join(accidents_data_dir, "Users.txt"), sep="\t") + vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") + places_df = pd.read_csv(os.path.join(accidents_data_dir, "Places.txt"), sep="\t") - # Create the dataset spec - # Note: We discard the "Gravity" column from the "Users" table to avoid a target - # leak. This is because the column was used to build the target. + # Build the multi-table dataset spec (drop the target column "Gravity") X = { "main_table": "Accidents", "tables": { - "Accidents": (accidents_df, "AccidentId"), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), - "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]), - "Places": (places_df, ["AccidentId"]), + "Users": (users_df, ["AccidentId", "VehicleId"]), + "Places": (places_df, "AccidentId"), }, "relations": [ ("Accidents", "Vehicles"), @@ -253,15 +238,8 @@ Samples ], } - # Load the target variable "Gravity" from the "AccidentsSummary" dataset - y = pd.read_csv( - os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), - usecols=["Gravity"], - sep="\t", - encoding="latin1", - ).squeeze( - "columns" - ) # squeeze to ensure pandas.Series + # Load the target variable "Gravity" + y = accidents_df["Gravity"] # Split into train and test datasets X_train, X_test, y_train, y_test = train_test_split_dataset(X, y) @@ -406,7 +384,6 @@ Samples accidents_df = pd.read_csv( os.path.join(accidents_dataset_path, "Accidents.txt"), sep="\t", - encoding="latin1", ) # Split the root dataframe into train and test @@ -575,11 +552,10 @@ Samples accidents_df = pd.read_csv( os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", - encoding="latin1", ) vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") - # Build the multi-table spec and the target + # Build the multi-table dataset spec (drop the target column "Gravity") X = { "main_table": "Accidents", "tables": { @@ -587,6 +563,8 @@ Samples "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, } + + # Load the target variable "Gravity" y = accidents_df["Gravity"] # Create the KhiopsEncoder with 5 multitable features and fit it @@ -609,50 +587,29 @@ Samples # Load the tables into dataframes accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") - accidents_df = pd.read_csv( - os.path.join(accidents_data_dir, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - places_df = pd.read_csv( - os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" - ) - users_df = pd.read_csv( - os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" - ) - vehicles_df = pd.read_csv( - os.path.join(accidents_data_dir, "Vehicles.txt"), - sep="\t", - encoding="latin1", - ) + accidents_df = pd.read_csv(os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t") + users_df = pd.read_csv(os.path.join(accidents_data_dir, "Users.txt"), sep="\t") + vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") + places_df = pd.read_csv(os.path.join(accidents_data_dir, "Places.txt"), sep="\t") - # Build the multi-table spec - # Note: We discard the "Gravity" field from the "Users" table as it was used to - # build the target column + # Build the multi-table dataset spec (drop the target column "Gravity") X = { "main_table": "Accidents", "tables": { - "Accidents": (accidents_df, "AccidentId"), - "Places": (places_df, "AccidentId"), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), - "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]), + "Users": (users_df, ["AccidentId", "VehicleId"]), + "Places": (places_df, "AccidentId"), }, "relations": [ ("Accidents", "Vehicles"), - ("Accidents", "Places", True), ("Vehicles", "Users"), + ("Accidents", "Places", True), ], } - # Load the target variable from the AccidentsSummary dataset - y = pd.read_csv( - os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), - usecols=["Gravity"], - sep="\t", - encoding="latin1", - ).squeeze( - "columns" - ) # squeeze to ensure pandas.Series + # Load the target variable "Gravity" + y = accidents_df["Gravity"] # Create the KhiopsEncoder with 10 additional multitable features and fit it khe = KhiopsEncoder(n_features=10) @@ -735,33 +692,26 @@ Samples from khiops import core as kh from khiops.sklearn import KhiopsEncoder - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") - accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - - # Obtain the root X feature table and the y target vector ("Class" column) - X_main = accidents_df.drop("Gravity", axis=1) - y = accidents_df["Gravity"] - - # Load the secondary table of the dataset into a pandas dataframe - X_secondary = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" - ) + # Load the tables into dataframes + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + accidents_df = pd.read_csv(os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t") + vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t") - # Create the dataset multitable specification for the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_dataset = { + # Build the multi-table dataset spec (drop the target column "Gravity") + X = { "main_table": "Accidents", "tables": { - "Accidents": (X_main, "AccidentId"), - "Vehicles": (X_secondary, ["AccidentId", "VehicleId"]), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, + "relations": [ + ("Accidents", "Vehicles"), + ], } + # Load the target variable "Gravity" + y = accidents_df["Gravity"] + # Create the KhiopsEncoder with 10 additional multitable features and fit it khe = KhiopsEncoder( n_features=20, @@ -777,13 +727,13 @@ Samples transform_type_numerical="part_id", transform_pairs="part_id", ) - khe.fit(X_dataset, y) + khe.fit(X, y) # Transform the train dataset print("Encoded feature names:") print(khe.feature_names_out_) print("Encoded data:") - print(khe.transform(X_dataset)[:10]) + print(khe.transform(X)[:10]) .. autofunction:: khiops_coclustering .. code-block:: python @@ -867,7 +817,6 @@ Samples accidents_df = pd.read_csv( os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", - encoding="latin1", ) X = accidents_df.drop("Gravity", axis=1) y = accidents_df["Gravity"] @@ -932,7 +881,6 @@ Samples accidents_df = pd.read_csv( os.path.join(accidents_dataset_path, "Accidents.txt"), sep="\t", - encoding="latin1", ) # Split the root dataframe into train and test diff --git a/khiops/samples/samples_sklearn.ipynb b/khiops/samples/samples_sklearn.ipynb index 13d5fd30..c79fe141 100644 --- a/khiops/samples/samples_sklearn.ipynb +++ b/khiops/samples/samples_sklearn.ipynb @@ -176,7 +176,6 @@ "accidents_df = pd.read_csv(\n", " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", ")\n", "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", "\n", @@ -243,33 +242,19 @@ "\n", "# Load the dataset tables into dataframes\n", "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", - "accidents_df = pd.read_csv(\n", - " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - ")\n", - "users_df = pd.read_csv(\n", - " os.path.join(accidents_data_dir, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", - ")\n", - "vehicles_df = pd.read_csv(\n", - " os.path.join(accidents_data_dir, \"Vehicles.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - ")\n", - "places_df = pd.read_csv(\n", - " os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", - ")\n", + "accidents_df = pd.read_csv(os.path.join(accidents_data_dir, \"Accidents.txt\"), sep=\"\\t\")\n", + "users_df = pd.read_csv(os.path.join(accidents_data_dir, \"Users.txt\"), sep=\"\\t\")\n", + "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", + "places_df = pd.read_csv(os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\")\n", "\n", - "# Create the dataset spec\n", - "# Note: We discard the \"Gravity\" column from the \"Users\" table to avoid a target\n", - "# leak. This is because the column was used to build the target.\n", + "# Build the multi-table dataset spec (drop the target column \"Gravity\")\n", "X = {\n", " \"main_table\": \"Accidents\",\n", " \"tables\": {\n", - " \"Accidents\": (accidents_df, \"AccidentId\"),\n", + " \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n", " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", - " \"Users\": (users_df.drop(\"Gravity\", axis=1), [\"AccidentId\", \"VehicleId\"]),\n", - " \"Places\": (places_df, [\"AccidentId\"]),\n", + " \"Users\": (users_df, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Places\": (places_df, \"AccidentId\"),\n", " },\n", " \"relations\": [\n", " (\"Accidents\", \"Vehicles\"),\n", @@ -278,15 +263,8 @@ " ],\n", "}\n", "\n", - "# Load the target variable \"Gravity\" from the \"AccidentsSummary\" dataset\n", - "y = pd.read_csv(\n", - " os.path.join(kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.txt\"),\n", - " usecols=[\"Gravity\"],\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - ").squeeze(\n", - " \"columns\"\n", - ") # squeeze to ensure pandas.Series\n", + "# Load the target variable \"Gravity\"\n", + "y = accidents_df[\"Gravity\"]\n", "\n", "# Split into train and test datasets\n", "X_train, X_test, y_train, y_test = train_test_split_dataset(X, y)\n", @@ -470,7 +448,6 @@ "accidents_df = pd.read_csv(\n", " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", ")\n", "\n", "# Split the root dataframe into train and test\n", @@ -678,11 +655,10 @@ "accidents_df = pd.read_csv(\n", " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", ")\n", "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", "\n", - "# Build the multi-table spec and the target\n", + "# Build the multi-table dataset spec (drop the target column \"Gravity\")\n", "X = {\n", " \"main_table\": \"Accidents\",\n", " \"tables\": {\n", @@ -690,6 +666,8 @@ " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", " },\n", "}\n", + "\n", + "# Load the target variable \"Gravity\"\n", "y = accidents_df[\"Gravity\"]\n", "\n", "# Create the KhiopsEncoder with 5 multitable features and fit it\n", @@ -725,50 +703,29 @@ "\n", "# Load the tables into dataframes\n", "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"Accidents\")\n", - "accidents_df = pd.read_csv(\n", - " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - ")\n", - "places_df = pd.read_csv(\n", - " os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", - ")\n", - "users_df = pd.read_csv(\n", - " os.path.join(accidents_data_dir, \"Users.txt\"), sep=\"\\t\", encoding=\"latin1\"\n", - ")\n", - "vehicles_df = pd.read_csv(\n", - " os.path.join(accidents_data_dir, \"Vehicles.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - ")\n", + "accidents_df = pd.read_csv(os.path.join(accidents_data_dir, \"Accidents.txt\"), sep=\"\\t\")\n", + "users_df = pd.read_csv(os.path.join(accidents_data_dir, \"Users.txt\"), sep=\"\\t\")\n", + "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", + "places_df = pd.read_csv(os.path.join(accidents_data_dir, \"Places.txt\"), sep=\"\\t\")\n", "\n", - "# Build the multi-table spec\n", - "# Note: We discard the \"Gravity\" field from the \"Users\" table as it was used to\n", - "# build the target column\n", + "# Build the multi-table dataset spec (drop the target column \"Gravity\")\n", "X = {\n", " \"main_table\": \"Accidents\",\n", " \"tables\": {\n", - " \"Accidents\": (accidents_df, \"AccidentId\"),\n", - " \"Places\": (places_df, \"AccidentId\"),\n", + " \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n", " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", - " \"Users\": (users_df.drop(\"Gravity\", axis=1), [\"AccidentId\", \"VehicleId\"]),\n", + " \"Users\": (users_df, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Places\": (places_df, \"AccidentId\"),\n", " },\n", " \"relations\": [\n", " (\"Accidents\", \"Vehicles\"),\n", - " (\"Accidents\", \"Places\", True),\n", " (\"Vehicles\", \"Users\"),\n", + " (\"Accidents\", \"Places\", True),\n", " ],\n", "}\n", "\n", - "# Load the target variable from the AccidentsSummary dataset\n", - "y = pd.read_csv(\n", - " os.path.join(kh.get_samples_dir(), \"AccidentsSummary\", \"Accidents.txt\"),\n", - " usecols=[\"Gravity\"],\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - ").squeeze(\n", - " \"columns\"\n", - ") # squeeze to ensure pandas.Series\n", + "# Load the target variable \"Gravity\"\n", + "y = accidents_df[\"Gravity\"]\n", "\n", "# Create the KhiopsEncoder with 10 additional multitable features and fit it\n", "khe = KhiopsEncoder(n_features=10)\n", @@ -877,33 +834,26 @@ "from khiops import core as kh\n", "from khiops.sklearn import KhiopsEncoder\n", "\n", - "# Load the root table of the dataset into a pandas dataframe\n", - "accidents_dataset_path = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", - "accidents_df = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", - " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", - ")\n", - "\n", - "# Obtain the root X feature table and the y target vector (\"Class\" column)\n", - "X_main = accidents_df.drop(\"Gravity\", axis=1)\n", - "y = accidents_df[\"Gravity\"]\n", - "\n", - "# Load the secondary table of the dataset into a pandas dataframe\n", - "X_secondary = pd.read_csv(\n", - " os.path.join(accidents_dataset_path, \"Vehicles.txt\"), sep=\"\\t\"\n", - ")\n", + "# Load the tables into dataframes\n", + "accidents_data_dir = os.path.join(kh.get_samples_dir(), \"AccidentsSummary\")\n", + "accidents_df = pd.read_csv(os.path.join(accidents_data_dir, \"Accidents.txt\"), sep=\"\\t\")\n", + "vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, \"Vehicles.txt\"), sep=\"\\t\")\n", "\n", - "# Create the dataset multitable specification for the train/test split\n", - "# We specify each table with a name and a tuple (dataframe, key_columns)\n", - "X_dataset = {\n", + "# Build the multi-table dataset spec (drop the target column \"Gravity\")\n", + "X = {\n", " \"main_table\": \"Accidents\",\n", " \"tables\": {\n", - " \"Accidents\": (X_main, \"AccidentId\"),\n", - " \"Vehicles\": (X_secondary, [\"AccidentId\", \"VehicleId\"]),\n", + " \"Accidents\": (accidents_df.drop(\"Gravity\", axis=1), \"AccidentId\"),\n", + " \"Vehicles\": (vehicles_df, [\"AccidentId\", \"VehicleId\"]),\n", " },\n", + " \"relations\": [\n", + " (\"Accidents\", \"Vehicles\"),\n", + " ],\n", "}\n", "\n", + "# Load the target variable \"Gravity\"\n", + "y = accidents_df[\"Gravity\"]\n", + "\n", "# Create the KhiopsEncoder with 10 additional multitable features and fit it\n", "khe = KhiopsEncoder(\n", " n_features=20,\n", @@ -919,13 +869,13 @@ " transform_type_numerical=\"part_id\",\n", " transform_pairs=\"part_id\",\n", ")\n", - "khe.fit(X_dataset, y)\n", + "khe.fit(X, y)\n", "\n", "# Transform the train dataset\n", "print(\"Encoded feature names:\")\n", "print(khe.feature_names_out_)\n", "print(\"Encoded data:\")\n", - "print(khe.transform(X_dataset)[:10])" + "print(khe.transform(X)[:10])" ] }, { @@ -1048,7 +998,6 @@ "accidents_df = pd.read_csv(\n", " os.path.join(accidents_data_dir, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", ")\n", "X = accidents_df.drop(\"Gravity\", axis=1)\n", "y = accidents_df[\"Gravity\"]\n", @@ -1126,7 +1075,6 @@ "accidents_df = pd.read_csv(\n", " os.path.join(accidents_dataset_path, \"Accidents.txt\"),\n", " sep=\"\\t\",\n", - " encoding=\"latin1\",\n", ")\n", "\n", "# Split the root dataframe into train and test\n", diff --git a/khiops/samples/samples_sklearn.py b/khiops/samples/samples_sklearn.py index d61cc49e..afca3259 100644 --- a/khiops/samples/samples_sklearn.py +++ b/khiops/samples/samples_sklearn.py @@ -157,7 +157,6 @@ def khiops_classifier_multitable_star(): accidents_df = pd.read_csv( os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", - encoding="latin1", ) vehicles_df = pd.read_csv( os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" @@ -215,32 +214,22 @@ def khiops_classifier_multitable_snowflake(): # Load the dataset tables into dataframes accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") accidents_df = pd.read_csv( - os.path.join(accidents_data_dir, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - users_df = pd.read_csv( - os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t" ) + users_df = pd.read_csv(os.path.join(accidents_data_dir, "Users.txt"), sep="\t") vehicles_df = pd.read_csv( - os.path.join(accidents_data_dir, "Vehicles.txt"), - sep="\t", - encoding="latin1", - ) - places_df = pd.read_csv( - os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) + places_df = pd.read_csv(os.path.join(accidents_data_dir, "Places.txt"), sep="\t") - # Create the dataset spec - # Note: We discard the "Gravity" column from the "Users" table to avoid a target - # leak. This is because the column was used to build the target. + # Build the multi-table dataset spec (drop the target column "Gravity") X = { "main_table": "Accidents", "tables": { - "Accidents": (accidents_df, "AccidentId"), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), - "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]), - "Places": (places_df, ["AccidentId"]), + "Users": (users_df, ["AccidentId", "VehicleId"]), + "Places": (places_df, "AccidentId"), }, "relations": [ ("Accidents", "Vehicles"), @@ -249,15 +238,8 @@ def khiops_classifier_multitable_snowflake(): ], } - # Load the target variable "Gravity" from the "AccidentsSummary" dataset - y = pd.read_csv( - os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), - usecols=["Gravity"], - sep="\t", - encoding="latin1", - ).squeeze( - "columns" - ) # squeeze to ensure pandas.Series + # Load the target variable "Gravity" + y = accidents_df["Gravity"] # Split into train and test datasets X_train, X_test, y_train, y_test = train_test_split_dataset(X, y) @@ -407,7 +389,6 @@ def khiops_classifier_with_hyperparameters(): accidents_df = pd.read_csv( os.path.join(accidents_dataset_path, "Accidents.txt"), sep="\t", - encoding="latin1", ) # Split the root dataframe into train and test @@ -590,13 +571,12 @@ def khiops_encoder_multitable_star(): accidents_df = pd.read_csv( os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", - encoding="latin1", ) vehicles_df = pd.read_csv( os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) - # Build the multi-table spec and the target + # Build the multi-table dataset spec (drop the target column "Gravity") X = { "main_table": "Accidents", "tables": { @@ -604,6 +584,8 @@ def khiops_encoder_multitable_star(): "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, } + + # Load the target variable "Gravity" y = accidents_df["Gravity"] # Create the KhiopsEncoder with 5 multitable features and fit it @@ -628,49 +610,32 @@ def khiops_encoder_multitable_snowflake(): # Load the tables into dataframes accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents") accidents_df = pd.read_csv( - os.path.join(accidents_data_dir, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - places_df = pd.read_csv( - os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1" - ) - users_df = pd.read_csv( - os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1" + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t" ) + users_df = pd.read_csv(os.path.join(accidents_data_dir, "Users.txt"), sep="\t") vehicles_df = pd.read_csv( - os.path.join(accidents_data_dir, "Vehicles.txt"), - sep="\t", - encoding="latin1", + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) + places_df = pd.read_csv(os.path.join(accidents_data_dir, "Places.txt"), sep="\t") - # Build the multi-table spec - # Note: We discard the "Gravity" field from the "Users" table as it was used to - # build the target column + # Build the multi-table dataset spec (drop the target column "Gravity") X = { "main_table": "Accidents", "tables": { - "Accidents": (accidents_df, "AccidentId"), - "Places": (places_df, "AccidentId"), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), - "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]), + "Users": (users_df, ["AccidentId", "VehicleId"]), + "Places": (places_df, "AccidentId"), }, "relations": [ ("Accidents", "Vehicles"), - ("Accidents", "Places", True), ("Vehicles", "Users"), + ("Accidents", "Places", True), ], } - # Load the target variable from the AccidentsSummary dataset - y = pd.read_csv( - os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), - usecols=["Gravity"], - sep="\t", - encoding="latin1", - ).squeeze( - "columns" - ) # squeeze to ensure pandas.Series + # Load the target variable "Gravity" + y = accidents_df["Gravity"] # Create the KhiopsEncoder with 10 additional multitable features and fit it khe = KhiopsEncoder(n_features=10) @@ -761,33 +726,30 @@ def khiops_encoder_with_hyperparameters(): from khiops import core as kh from khiops.sklearn import KhiopsEncoder - # Load the root table of the dataset into a pandas dataframe - accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary") + # Load the tables into dataframes + accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary") accidents_df = pd.read_csv( - os.path.join(accidents_dataset_path, "Accidents.txt"), - sep="\t", - encoding="latin1", + os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t" ) - - # Obtain the root X feature table and the y target vector ("Class" column) - X_main = accidents_df.drop("Gravity", axis=1) - y = accidents_df["Gravity"] - - # Load the secondary table of the dataset into a pandas dataframe - X_secondary = pd.read_csv( - os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" + vehicles_df = pd.read_csv( + os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t" ) - # Create the dataset multitable specification for the train/test split - # We specify each table with a name and a tuple (dataframe, key_columns) - X_dataset = { + # Build the multi-table dataset spec (drop the target column "Gravity") + X = { "main_table": "Accidents", "tables": { - "Accidents": (X_main, "AccidentId"), - "Vehicles": (X_secondary, ["AccidentId", "VehicleId"]), + "Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), }, + "relations": [ + ("Accidents", "Vehicles"), + ], } + # Load the target variable "Gravity" + y = accidents_df["Gravity"] + # Create the KhiopsEncoder with 10 additional multitable features and fit it khe = KhiopsEncoder( n_features=20, @@ -803,13 +765,13 @@ def khiops_encoder_with_hyperparameters(): transform_type_numerical="part_id", transform_pairs="part_id", ) - khe.fit(X_dataset, y) + khe.fit(X, y) # Transform the train dataset print("Encoded feature names:") print(khe.feature_names_out_) print("Encoded data:") - print(khe.transform(X_dataset)[:10]) + print(khe.transform(X)[:10]) # pylint: enable=line-too-long @@ -908,7 +870,6 @@ def khiops_classifier_multitable_list(): accidents_df = pd.read_csv( os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t", - encoding="latin1", ) X = accidents_df.drop("Gravity", axis=1) y = accidents_df["Gravity"] @@ -985,7 +946,6 @@ def khiops_classifier_multitable_star_file(): accidents_df = pd.read_csv( os.path.join(accidents_dataset_path, "Accidents.txt"), sep="\t", - encoding="latin1", ) # Split the root dataframe into train and test diff --git a/khiops/tools.py b/khiops/tools.py index a334147b..a4d27510 100644 --- a/khiops/tools.py +++ b/khiops/tools.py @@ -89,7 +89,7 @@ def kh_samples_entry_point(): # pragma: no cover # Samples version: To be updated when khiops-samples does -DEFAULT_SAMPLES_VERSION = "10.2.0" +DEFAULT_SAMPLES_VERSION = "10.2.4" def kh_download_datasets_entry_point(): diff --git a/tests/test_estimator_attributes.py b/tests/test_estimator_attributes.py index f98d57d6..299d64ff 100644 --- a/tests/test_estimator_attributes.py +++ b/tests/test_estimator_attributes.py @@ -35,39 +35,33 @@ class EstimatorAttributesTests(unittest.TestCase): """ def _create_multitable_input(self, size=None): + # Load `Accidents` into dataframes accidents_dataset_path = path.join(kh.get_samples_dir(), "Accidents") accidents_df = pd.read_csv( - path.join(accidents_dataset_path, "Accidents.txt"), - sep="\t", - encoding="latin1", - ) - users_df = pd.read_csv( - path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1" + path.join(accidents_dataset_path, "Accidents.txt"), sep="\t" ) + users_df = pd.read_csv(path.join(accidents_dataset_path, "Users.txt"), sep="\t") vehicles_df = pd.read_csv( - path.join(accidents_dataset_path, "Vehicles.txt"), - sep="\t", - encoding="latin1", + path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t" ) places_df = pd.read_csv( - path.join(accidents_dataset_path, "Places.txt"), - sep="\t", - encoding="latin1", - low_memory=False, + path.join(accidents_dataset_path, "Places.txt"), sep="\t", low_memory=False ) + # Set the sample size if size is None: size = len(accidents_df) + # Create the multi-table dataset spec X = { "main_table": "Accidents", "tables": { - "Accidents": (accidents_df[:size], "AccidentId"), - "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), - "Users": ( - users_df.drop("Gravity", axis=1), - ["AccidentId", "VehicleId"], + "Accidents": ( + accidents_df.drop("Gravity", axis=1)[:size], + "AccidentId", ), + "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]), + "Users": (users_df, ["AccidentId", "VehicleId"]), "Places": (places_df, ["AccidentId"]), }, "relations": [ @@ -76,12 +70,7 @@ def _create_multitable_input(self, size=None): ("Accidents", "Places", True), ], } - - y = pd.read_csv( - path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"), - sep="\t", - encoding="latin1", - )["Gravity"][:size] + y = accidents_df["Gravity"][:size] return X, y