Skip to content

Commit

Permalink
Merge pull request #303 from KhiopsML/update-samples
Browse files Browse the repository at this point in the history
Update samples for khiops-samples dataset release 10.2.4
  • Loading branch information
folmos-at-orange authored Dec 11, 2024
2 parents df90b0e + 3c47c76 commit 20f8d48
Show file tree
Hide file tree
Showing 8 changed files with 146 additions and 302 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
---
name: Unit Tests
env:
DEFAULT_SAMPLES_REVISION: 10.2.0
DEFAULT_SAMPLES_REVISION: 10.2.4
DEFAULT_KHIOPS_DESKTOP_REVISION: 10.2.3
on:
workflow_dispatch:
inputs:
samples-revision:
default: 10.2.0
description: Git tag, branch or commit for the khiops-samples repository
default: 10.2.4
description: Git Tag/Branch/Commit for the khiops-samples Repo
image-tag:
default: latest
description: Development Docker Image Tag
Expand Down
2 changes: 1 addition & 1 deletion doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
sys.path.append("../khiops/samples")
import khiops

project = "Khiops"
project = "Khiops Python"
copyright = f"2018-{datetime.today().year}, Orange"
author = "The Khiops Team"

Expand Down
15 changes: 7 additions & 8 deletions doc/multi_table_primer.rst
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,8 @@ schema:
We build the input ``X`` as follows::

accidents_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Accidents.txt", sep="\t", encoding="latin1")
vehicles_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Vehicles.txt", sep="\t", encoding="latin1")
accidents_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Accidents.txt", sep="\t")
vehicles_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Vehicles.txt", sep="\t")
X = {
"main_table" : "Accident",
"tables": {
Expand Down Expand Up @@ -164,19 +164,18 @@ through the following *snowflake* schema
We build the input ``X`` as follows::

# We use `Accidents.txt` table of `AccidentsSummary` as it contains the `Gravity` label pre-calculated
accidents_df = pd.read_csv(f"{kh.get_samples_dir()}/AccidentsSummary/Accidents.txt", sep="\t", encoding="latin1")
vehicles_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Vehicles.txt", sep="\t", encoding="latin1")
users_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Users.txt", sep="\t", encoding="latin1")
places_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Places.txt", sep="\t", encoding="latin1")
accidents_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Accidents.txt", sep="\t")
vehicles_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Vehicles.txt", sep="\t")
users_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Users.txt", sep="\t")
places_df = pd.read_csv(f"{kh.get_samples_dir()}/Accidents/Places.txt", sep="\t")

X = {
"main_table": "Accidents",
"tables": {
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
"Users": (users_df, ["AccidentId", "VehicleId"]),
"Places": (places_df, ["AccidentId"]),
"Places": (places_df, "AccidentId"),

},
"relations": [
Expand Down
132 changes: 40 additions & 92 deletions doc/samples/samples_sklearn.rst
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,6 @@ Samples
accidents_df = pd.read_csv(
os.path.join(accidents_data_dir, "Accidents.txt"),
sep="\t",
encoding="latin1",
)
vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t")
Expand Down Expand Up @@ -218,33 +217,19 @@ Samples
# Load the dataset tables into dataframes
accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents")
accidents_df = pd.read_csv(
os.path.join(accidents_data_dir, "Accidents.txt"),
sep="\t",
encoding="latin1",
)
users_df = pd.read_csv(
os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1"
)
vehicles_df = pd.read_csv(
os.path.join(accidents_data_dir, "Vehicles.txt"),
sep="\t",
encoding="latin1",
)
places_df = pd.read_csv(
os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1"
)
accidents_df = pd.read_csv(os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t")
users_df = pd.read_csv(os.path.join(accidents_data_dir, "Users.txt"), sep="\t")
vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t")
places_df = pd.read_csv(os.path.join(accidents_data_dir, "Places.txt"), sep="\t")
# Create the dataset spec
# Note: We discard the "Gravity" column from the "Users" table to avoid a target
# leak. This is because the column was used to build the target.
# Build the multi-table dataset spec (drop the target column "Gravity")
X = {
"main_table": "Accidents",
"tables": {
"Accidents": (accidents_df, "AccidentId"),
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
"Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]),
"Places": (places_df, ["AccidentId"]),
"Users": (users_df, ["AccidentId", "VehicleId"]),
"Places": (places_df, "AccidentId"),
},
"relations": [
("Accidents", "Vehicles"),
Expand All @@ -253,15 +238,8 @@ Samples
],
}
# Load the target variable "Gravity" from the "AccidentsSummary" dataset
y = pd.read_csv(
os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"),
usecols=["Gravity"],
sep="\t",
encoding="latin1",
).squeeze(
"columns"
) # squeeze to ensure pandas.Series
# Load the target variable "Gravity"
y = accidents_df["Gravity"]
# Split into train and test datasets
X_train, X_test, y_train, y_test = train_test_split_dataset(X, y)
Expand Down Expand Up @@ -406,7 +384,6 @@ Samples
accidents_df = pd.read_csv(
os.path.join(accidents_dataset_path, "Accidents.txt"),
sep="\t",
encoding="latin1",
)
# Split the root dataframe into train and test
Expand Down Expand Up @@ -575,18 +552,19 @@ Samples
accidents_df = pd.read_csv(
os.path.join(accidents_data_dir, "Accidents.txt"),
sep="\t",
encoding="latin1",
)
vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t")
# Build the multi-table spec and the target
# Build the multi-table dataset spec (drop the target column "Gravity")
X = {
"main_table": "Accidents",
"tables": {
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
},
}
# Load the target variable "Gravity"
y = accidents_df["Gravity"]
# Create the KhiopsEncoder with 5 multitable features and fit it
Expand All @@ -609,50 +587,29 @@ Samples
# Load the tables into dataframes
accidents_data_dir = os.path.join(kh.get_samples_dir(), "Accidents")
accidents_df = pd.read_csv(
os.path.join(accidents_data_dir, "Accidents.txt"),
sep="\t",
encoding="latin1",
)
places_df = pd.read_csv(
os.path.join(accidents_data_dir, "Places.txt"), sep="\t", encoding="latin1"
)
users_df = pd.read_csv(
os.path.join(accidents_data_dir, "Users.txt"), sep="\t", encoding="latin1"
)
vehicles_df = pd.read_csv(
os.path.join(accidents_data_dir, "Vehicles.txt"),
sep="\t",
encoding="latin1",
)
accidents_df = pd.read_csv(os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t")
users_df = pd.read_csv(os.path.join(accidents_data_dir, "Users.txt"), sep="\t")
vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t")
places_df = pd.read_csv(os.path.join(accidents_data_dir, "Places.txt"), sep="\t")
# Build the multi-table spec
# Note: We discard the "Gravity" field from the "Users" table as it was used to
# build the target column
# Build the multi-table dataset spec (drop the target column "Gravity")
X = {
"main_table": "Accidents",
"tables": {
"Accidents": (accidents_df, "AccidentId"),
"Places": (places_df, "AccidentId"),
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
"Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]),
"Users": (users_df, ["AccidentId", "VehicleId"]),
"Places": (places_df, "AccidentId"),
},
"relations": [
("Accidents", "Vehicles"),
("Accidents", "Places", True),
("Vehicles", "Users"),
("Accidents", "Places", True),
],
}
# Load the target variable from the AccidentsSummary dataset
y = pd.read_csv(
os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"),
usecols=["Gravity"],
sep="\t",
encoding="latin1",
).squeeze(
"columns"
) # squeeze to ensure pandas.Series
# Load the target variable "Gravity"
y = accidents_df["Gravity"]
# Create the KhiopsEncoder with 10 additional multitable features and fit it
khe = KhiopsEncoder(n_features=10)
Expand Down Expand Up @@ -735,33 +692,26 @@ Samples
from khiops import core as kh
from khiops.sklearn import KhiopsEncoder
# Load the root table of the dataset into a pandas dataframe
accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
accidents_df = pd.read_csv(
os.path.join(accidents_dataset_path, "Accidents.txt"),
sep="\t",
encoding="latin1",
)
# Obtain the root X feature table and the y target vector ("Class" column)
X_main = accidents_df.drop("Gravity", axis=1)
y = accidents_df["Gravity"]
# Load the secondary table of the dataset into a pandas dataframe
X_secondary = pd.read_csv(
os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)
# Load the tables into dataframes
accidents_data_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
accidents_df = pd.read_csv(os.path.join(accidents_data_dir, "Accidents.txt"), sep="\t")
vehicles_df = pd.read_csv(os.path.join(accidents_data_dir, "Vehicles.txt"), sep="\t")
# Create the dataset multitable specification for the train/test split
# We specify each table with a name and a tuple (dataframe, key_columns)
X_dataset = {
# Build the multi-table dataset spec (drop the target column "Gravity")
X = {
"main_table": "Accidents",
"tables": {
"Accidents": (X_main, "AccidentId"),
"Vehicles": (X_secondary, ["AccidentId", "VehicleId"]),
"Accidents": (accidents_df.drop("Gravity", axis=1), "AccidentId"),
"Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
},
"relations": [
("Accidents", "Vehicles"),
],
}
# Load the target variable "Gravity"
y = accidents_df["Gravity"]
# Create the KhiopsEncoder with 10 additional multitable features and fit it
khe = KhiopsEncoder(
n_features=20,
Expand All @@ -777,13 +727,13 @@ Samples
transform_type_numerical="part_id",
transform_pairs="part_id",
)
khe.fit(X_dataset, y)
khe.fit(X, y)
# Transform the train dataset
print("Encoded feature names:")
print(khe.feature_names_out_)
print("Encoded data:")
print(khe.transform(X_dataset)[:10])
print(khe.transform(X)[:10])
.. autofunction:: khiops_coclustering
.. code-block:: python
Expand Down Expand Up @@ -867,7 +817,6 @@ Samples
accidents_df = pd.read_csv(
os.path.join(accidents_data_dir, "Accidents.txt"),
sep="\t",
encoding="latin1",
)
X = accidents_df.drop("Gravity", axis=1)
y = accidents_df["Gravity"]
Expand Down Expand Up @@ -932,7 +881,6 @@ Samples
accidents_df = pd.read_csv(
os.path.join(accidents_dataset_path, "Accidents.txt"),
sep="\t",
encoding="latin1",
)
# Split the root dataframe into train and test
Expand Down
Loading

0 comments on commit 20f8d48

Please sign in to comment.