Samples sklearn

The code snippets on this page demonstrate the basic use of the khiops.sklearn module.

Script and Jupyter notebook

The samples on this page are also available as a Python script and a Jupyter notebook.

Setup

First, make sure you have installed the sample datasets. In a configured conda shell (e.g., Anaconda Prompt on Windows), execute:

kh-download-datasets

If that doesn't work, open a Python console and execute:

from khiops.tools import download_datasets
download_datasets()
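
To check that the download succeeded, here is a quick sketch; it only assumes the default layout created by download_datasets, with one folder per dataset:

# Verify that the samples directory contains the datasets used below
import os
from khiops import core as kh

samples_dir = kh.get_samples_dir()
print(f"Samples directory: {samples_dir}")
for dataset in ["Adult", "Iris", "AccidentsSummary", "Accidents", "SpliceJunction"]:
    print(dataset, "present:", os.path.isdir(os.path.join(samples_dir, dataset)))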

Samples

samples_sklearn.khiops_classifier()

Trains a KhiopsClassifier on a monotable dataframe

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Load the dataset into a pandas dataframe
adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
adult_df = pd.read_csv(adult_path, sep="\t")

# Split the whole dataframe into train and test (70%-30%)
adult_train_df, adult_test_df = train_test_split(
    adult_df, test_size=0.3, random_state=1
)

# Split the dataset into:
# - the X feature table
# - the y target vector ("class" column)
X_train = adult_train_df.drop("class", axis=1)
X_test = adult_test_df.drop("class", axis=1)
y_train = adult_train_df["class"]
y_test = adult_test_df["class"]

# Create the classifier object
khc = KhiopsClassifier()

# Train the classifier
khc.fit(X_train, y_train)

# Predict the classes on the test dataset
y_test_pred = khc.predict(X_test)
print("Predicted classes (first 10):")
print(y_test_pred[0:10])
print("---")

# Predict the class probabilities on the test dataset
y_test_probas = khc.predict_proba(X_test)
print(f"Class order: {khc.classes_}")
print("Predicted class probabilities (first 10):")
print(y_test_probas[0:10])
print("---")

# Evaluate accuracy and auc metrics on the test dataset
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])
print(f"Test accuracy = {test_accuracy}")
print(f"Test auc      = {test_auc}")
samples_sklearn.khiops_classifier_multiclass()

Trains a multiclass KhiopsClassifier on a monotable dataframe

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Load the dataset into a pandas dataframe
iris_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt")
iris_df = pd.read_csv(iris_path, sep="\t")

# Split the whole dataframe into train and test (70%-30%)
iris_train_df, iris_test_df = train_test_split(iris_df, test_size=0.3, random_state=1)

# Split the dataset into:
# - the X feature table
# - the y target vector ("Class" column)
X_train = iris_train_df.drop("Class", axis=1)
X_test = iris_test_df.drop("Class", axis=1)
y_train = iris_train_df["Class"]
y_test = iris_test_df["Class"]

# Create the classifier object
khc = KhiopsClassifier()

# Train the classifier
khc.fit(X_train, y_train)

# Predict the classes on the test dataset
y_test_pred = khc.predict(X_test)
print("Predicted classes (first 10):")
print(y_test_pred[:10])
print("---")

# Predict the class probabilities on the test dataset
y_test_probas = khc.predict_proba(X_test)
print(f"Class order: {khc.classes_}")
print("Predicted class probabilities (first 10):")
print(y_test_probas[:10])
print("---")

# Evaluate accuracy and auc metrics on the test dataset
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class="ovr")
print(f"Test accuracy = {test_accuracy}")
print(f"Test auc      = {test_auc}")
samples_sklearn.khiops_classifier_multitable_star()

Trains a KhiopsClassifier on a star multi-table dataset

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Load the root table of the dataset into a pandas dataframe
accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
accidents_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Accidents.txt"),
    sep="\t",
    encoding="latin1",
)

# Split the root dataframe into train and test
accidents_train_df, accidents_test_df = train_test_split(
    accidents_df, test_size=0.3, random_state=1
)

# Obtain the main X feature table and the y target vector ("Gravity" column)
y_train = accidents_train_df["Gravity"]
y_test = accidents_test_df["Gravity"]
X_train_main = accidents_train_df.drop("Gravity", axis=1)
X_test_main = accidents_test_df.drop("Gravity", axis=1)

# Load the secondary table of the dataset into a pandas dataframe
vehicles_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train_main["AccidentId"].to_frame()
X_test_ids = X_test_main["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId")

# Create the dataset multitable specification for the train/test split
# We specify each table with a name and a tuple (dataframe, key_columns)
X_train = {
    "main_table": "Accidents",
    "tables": {
        "Accidents": (X_train_main, "AccidentId"),
        "Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]),
    },
}
X_test = {
    "main_table": "Accidents",
    "tables": {
        "Accidents": (X_test_main, "AccidentId"),
        "Vehicles": (X_test_secondary, ["AccidentId", "VehicleId"]),
    },
}

# Train the classifier (by default it analyzes 100 multi-table features)
khc = KhiopsClassifier()
khc.fit(X_train, y_train)

# Predict the class on the test dataset
y_test_pred = khc.predict(X_test)
print("Predicted classes (first 10):")
print(y_test_pred[:10])
print("---")

# Predict the class probability on the test dataset
y_test_probas = khc.predict_proba(X_test)
print(f"Class order: {khc.classes_}")
print("Predicted class probabilities (first 10):")
print(y_test_probas[:10])
print("---")

# Evaluate accuracy and auc metrics on the test dataset
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])
print(f"Test accuracy = {test_accuracy}")
print(f"Test auc      = {test_auc}")
samples_sklearn.khiops_classifier_multitable_snowflake()

Trains a KhiopsClassifier on a snowflake multi-table dataset

Note

For simplicity, we train on the whole dataset. To assess performance, one usually splits the dataset into train and test subsets, as sketched below.
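
A minimal splitting sketch for this snowflake dataset, in the spirit of khiops_classifier_multitable_star above; it reuses the dataframe names loaded in the sample below, and filtering the secondary tables by key with isin is one simple option:

# Hedged sketch: split the main table, then filter each secondary table
# by the train keys so the multi-table dataset stays consistent
from sklearn.model_selection import train_test_split

accidents_train_df, accidents_test_df = train_test_split(
    accidents_df, test_size=0.3, random_state=1
)
train_ids = accidents_train_df["AccidentId"]
vehicles_train_df = vehicles_df[vehicles_df["AccidentId"].isin(train_ids)]
users_train_df = users_df[users_df["AccidentId"].isin(train_ids)]
places_train_df = places_df[places_df["AccidentId"].isin(train_ids)]
# The test tables are built the same way from accidents_test_df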

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsClassifier
from sklearn import metrics

# Load the dataset tables into dataframes
accidents_dataset_path = os.path.join(kh.get_samples_dir(), "Accidents")
accidents_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Accidents.txt"),
    sep="\t",
    encoding="latin1",
)
users_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1"
)
vehicles_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Vehicles.txt"),
    sep="\t",
    encoding="latin1",
)
places_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Places.txt"), sep="\t", encoding="latin1"
)
# Build the multitable input X
# Note: We discard the "Gravity" field from the "Users" table as it was used to
# build the target column
X = {
    "main_table": "Accidents",
    "tables": {
        "Accidents": (accidents_df, "AccidentId"),
        "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
        "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]),
        "Places": (places_df, ["AccidentId"]),
    },
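    # Relations between tables as (parent, child) tuples; per the dataset
    # specification, a third element set to True marks a 1:1 (entity)
    # relation instead of the default 1:n relation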
    "relations": [
        ("Accidents", "Vehicles"),
        ("Vehicles", "Users"),
        ("Accidents", "Places", True),
    ],
}

# Load the target variable from the AccidentsSummary dataset
y = pd.read_csv(
    os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"),
    sep="\t",
    encoding="latin1",
)["Gravity"]

# Train the classifier (by default it creates 1000 multi-table features)
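# Note: n_trees=0 disables the construction of decision-tree features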
khc = KhiopsClassifier(n_trees=0)
khc.fit(X, y)

# Predict the class on the train dataset
y_pred = khc.predict(X)
print("Predicted classes (first 10):")
print(y_pred[:10])
print("---")

# Predict the class probability on the train dataset
y_probas = khc.predict_proba(X)
print(f"Class order: {khc.classes_}")
print("Predicted class probabilities (first 10):")
print(y_probas[:10])
print("---")

# Evaluate accuracy and auc metrics on the train dataset
train_accuracy = metrics.accuracy_score(y, y_pred)
train_auc = metrics.roc_auc_score(y, y_probas[:, 1])
print(f"Train accuracy = {train_accuracy}")
print(f"Train auc      = {train_auc}")
samples_sklearn.khiops_classifier_sparse()

Trains a KhiopsClassifier on a monotable sparse matrix

# Imports
from khiops.sklearn import KhiopsClassifier
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import HashingVectorizer

# Load 3 classes of the 20newsgroups dataset
categories = ["comp.graphics", "sci.space", "misc.forsale"]
data_train, y_train = fetch_20newsgroups(
    subset="train",
    categories=categories,
    return_X_y=True,
)
data_test, y_test = fetch_20newsgroups(
    subset="test",
    categories=categories,
    return_X_y=True,
)

# Extract features from the training data using a sparse vectorizer
vectorizer = HashingVectorizer(n_features=2**10, stop_words="english")
X_train = vectorizer.fit_transform(data_train)

# Extract features from the test data using the same vectorizer
X_test = vectorizer.transform(data_test)

# Create the classifier object
khc = KhiopsClassifier()

# Train the classifier
khc.fit(X_train, y_train)

# Predict the classes on the test dataset
y_test_pred = khc.predict(X_test)
print("Predicted classes (first 10):")
print(y_test_pred[0:10])
print("---")

# Predict the class probabilities on the test dataset
y_test_probas = khc.predict_proba(X_test)
print(f"Class order: {khc.classes_}")
print("Predicted class probabilities (first 10):")
print(y_test_probas[0:10])
print("---")

# Evaluate accuracy and auc metrics on the test dataset
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
test_auc = metrics.roc_auc_score(y_test, y_test_probas, multi_class="ovr")
print(f"Test accuracy = {test_accuracy}")
print(f"Test auc      = {test_auc}")
samples_sklearn.khiops_classifier_pickle()

Shows the serialization and deserialization of a KhiopsClassifier

# Imports
import os
import pandas as pd
import pickle
from khiops import core as kh
from khiops.sklearn import KhiopsClassifier

# Load the dataset into a pandas dataframe
iris_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt")
iris_df = pd.read_csv(iris_path, sep="\t")

# Train the model with the whole dataset
X = iris_df.drop(["Class"], axis=1)
y = iris_df["Class"]
khc = KhiopsClassifier()
khc.fit(X, y)

# Create the output directory and clean any previous pickle file
results_dir = os.path.join("kh_samples", "khiops_classifier_pickle")
os.makedirs(results_dir, exist_ok=True)
khc_pickle_path = os.path.join(results_dir, "khiops_classifier.pkl")
if os.path.exists(khc_pickle_path):
    os.remove(khc_pickle_path)

# Pickle its content to a file
with open(khc_pickle_path, "wb") as khc_pickle_write_file:
    pickle.dump(khc, khc_pickle_write_file)

# Unpickle it
with open(khc_pickle_path, "rb") as khc_pickle_file:
    new_khc = pickle.load(khc_pickle_file)

# Make some predictions on the training dataset with the unpickled classifier
y_predicted = new_khc.predict(X)
print("Predicted classes (first 10):")
print(y_predicted[:10])
print("---")
samples_sklearn.khiops_regressor()

Trains a KhiopsRegressor on a monotable dataframe

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Load the dataset into a pandas dataframe
adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
adult_df = pd.read_csv(adult_path, sep="\t")

# Split the whole dataframe into train and test (40% train, 60% test, for speed)
adult_train_df, adult_test_df = train_test_split(
    adult_df, test_size=0.6, random_state=1
)

# Split the dataset into:
# - the X feature table
# - the y target vector ("age" column)
X_train = adult_train_df.drop("age", axis=1)
X_test = adult_test_df.drop("age", axis=1)
y_train = adult_train_df["age"]
y_test = adult_test_df["age"]

# Create the regressor object
khr = KhiopsRegressor()

# Train the regressor
khr.fit(X_train, y_train)

# Predict the values on the test dataset
y_test_pred = khr.predict(X_test)
print("Predicted values for 'age' (first 10):")
print(y_test_pred[:10])
print("---")

# Evaluate R2 and MAE metrics on the test dataset
test_r2 = metrics.r2_score(y_test, y_test_pred)
test_mae = metrics.mean_absolute_error(y_test, y_test_pred)
print(f"Test R2  = {test_r2}")
print(f"Test MAE = {test_mae}")
samples_sklearn.khiops_encoder()

Trains a KhiopsEncoder on a monotable dataframe

The Khiops encoder is a supervised feature encoder. It discretizes numerical features and groups the values of categorical features so that the resulting intervals/groups have the highest class purity.

Note

For simplicity, we train on the whole dataset. To assess performance, one usually splits the dataset into train and test subsets.

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsEncoder

# Load the dataset into a pandas dataframe
iris_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt")
iris_df = pd.read_csv(iris_path, sep="\t")

# Train the model with the whole dataset
X = iris_df.drop("Class", axis=1)
y = iris_df["Class"]

# Create the encoder object
khe = KhiopsEncoder()
khe.fit(X, y)

# Transform the training dataset
X_transformed = khe.transform(X)

# Print both the original and transformed features
print("Original:")
print(X.head(10))
print("---")
print("Encoded feature names:")
print(khe.feature_names_out_)
print("Encoded data:")
print(X_transformed[:10])
print("---")
samples_sklearn.khiops_encoder_multitable_star()

Trains a KhiopsEncoder on a star multi-table dataset

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsEncoder

# Load the root table of the dataset into a pandas dataframe
accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
accidents_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Accidents.txt"),
    sep="\t",
    encoding="latin1",
)

# Obtain the root X feature table and the y target vector ("Gravity" column)
X_main = accidents_df.drop("Gravity", axis=1)
y = accidents_df["Gravity"]

# Load the secondary table of the dataset into a pandas dataframe
X_secondary = pd.read_csv(
    os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Create the multi-table dataset specification
# We specify each table with a name and a tuple (dataframe, key_columns)
X_dataset = {
    "main_table": "Accidents",
    "tables": {
        "Accidents": (X_main, "AccidentId"),
        "Vehicles": (X_secondary, ["AccidentId", "VehicleId"]),
    },
}

# Create the KhiopsEncoder with 10 additional multi-table features and fit it
khe = KhiopsEncoder(n_features=10)
khe.fit(X_dataset, y)

# Transform the dataset and print the encoded features
print("Encoded feature names:")
print(khe.feature_names_out_)
print("Encoded data:")
print(khe.transform(X_dataset)[:10])
samples_sklearn.khiops_encoder_multitable_snowflake()

Trains a KhiopsEncoder on a snowflake multi-table dataset

Note

For simplicity, we train on the whole dataset. To assess performance, one usually splits the dataset into train and test subsets.

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsEncoder

# Load the tables into dataframes
accidents_dataset_path = os.path.join(kh.get_samples_dir(), "Accidents")
accidents_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Accidents.txt"),
    sep="\t",
    encoding="latin1",
)
users_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Users.txt"), sep="\t", encoding="latin1"
)
vehicles_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Vehicles.txt"),
    sep="\t",
    encoding="latin1",
)

# Build the multitable input X
# Note: We discard the "Gravity" field from the "Users" table as it was used to
# build the target column
X = {
    "main_table": "Accidents",
    "tables": {
        "Accidents": (accidents_df, "AccidentId"),
        "Vehicles": (vehicles_df, ["AccidentId", "VehicleId"]),
        "Users": (users_df.drop("Gravity", axis=1), ["AccidentId", "VehicleId"]),
    },
    "relations": [
        ("Accidents", "Vehicles"),
        ("Vehicles", "Users"),
    ],
}

# Load the target variable from the AccidentsSummary dataset
y = pd.read_csv(
    os.path.join(kh.get_samples_dir(), "AccidentsSummary", "Accidents.txt"),
    sep="\t",
    encoding="latin1",
)["Gravity"]

# Create the KhiopsEncoder with 10 additional multi-table features and fit it
khe = KhiopsEncoder(n_features=10)
khe.fit(X, y)

# Transform the dataset and print the encoded features
print("Encoded feature names:")
print(khe.feature_names_out_)
print("Encoded data:")
print(khe.transform(X)[:10])
samples_sklearn.khiops_encoder_pipeline_with_hgbc()

Chains a KhiopsEncoder with a HistGradientBoostingClassifier

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsEncoder
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Load the dataset into a pandas dataframe
adult_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
adult_df = pd.read_csv(adult_path, sep="\t")

# Split the whole dataframe into train and test (70%-30%)
adult_train_df, adult_test_df = train_test_split(
    adult_df, test_size=0.3, random_state=1
)

# Split the dataset into:
# - the X feature table
# - the y target vector ("class" column)
X_train = adult_train_df.drop("class", axis=1)
X_test = adult_test_df.drop("class", axis=1)
y_train = adult_train_df["class"]
y_test = adult_test_df["class"]

# Create the pipeline and fit it. Steps:
# - The Khiops supervised column encoder, which generates an all-categorical table
# - A one-hot encoder applied to all columns
# - The HistGradientBoosting classifier
pipe_steps = [
    ("khiops_enc", KhiopsEncoder()),
    (
        "onehot_enc",
        ColumnTransformer([], remainder=OneHotEncoder(sparse_output=False)),
        # For sklearn < 1.2, use
        # ColumnTransformer([], remainder=OneHotEncoder(sparse=False)),
    ),
    ("hgb_clf", HistGradientBoostingClassifier()),
]
pipe = Pipeline(pipe_steps)
pipe.fit(X_train, y_train)

# Predict the classes on the test dataset
y_test_pred = pipe.predict(X_test)
print("Predicted classes (first 10):")
print(y_test_pred[:10])
print("---")

# Predict the class probabilities on the test dataset
y_test_probas = pipe.predict_proba(X_test)
print("Predicted class probabilities (first 10):")
print(y_test_probas[:10])
print("---")

# Evaluate accuracy and auc metrics on the test dataset
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])
print(f"Test accuracy = {test_accuracy}")
print(f"Test auc      = {test_auc}")
samples_sklearn.khiops_coclustering()

Trains a KhiopsCoclustering on a dataframe

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsCoclustering
from sklearn.model_selection import train_test_split

# Load the secondary table of the dataset into a pandas dataframe
splice_dataset_path = os.path.join(kh.get_samples_dir(), "SpliceJunction")
splice_dna_X = pd.read_csv(
    os.path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t"
)

# Train with only 70% of data (for speed in this example)
X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1)

# Create the KhiopsCoclustering instance
khcc = KhiopsCoclustering()

# Train the model
khcc.fit(X, id_column="SampleId")

# Predict the clusters in some instances
X_clusters = khcc.predict(X)
print("Predicted clusters (first 10)")
print(X_clusters[:10])
print("---")
samples_sklearn.khiops_coclustering_simplify()

Simplifies a KhiopsCoclustering already trained on a dataframe

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsCoclustering
from sklearn.model_selection import train_test_split

# Load the secondary table of the dataset into a pandas dataframe
splice_dataset_path = os.path.join(kh.get_samples_dir(), "SpliceJunction")
splice_dna_X = pd.read_csv(
    os.path.join(splice_dataset_path, "SpliceJunctionDNA.txt"), sep="\t"
)

# Train with only 70% of data (for speed in this example)
X, _ = train_test_split(splice_dna_X, test_size=0.3, random_state=1)

# Create the KhiopsCoclustering instance
khcc = KhiopsCoclustering()

# Train the model
khcc.fit(X, id_column="SampleId")

# Simplify coclustering along the individual ID dimension
simplified_khcc = khcc.simplify(max_part_numbers={"SampleId": 3})

# Predict the clusters using the simplified model
X_clusters = simplified_khcc.predict(X)
print("Predicted clusters (only three at most)")
print(X_clusters)
print("---")
samples_sklearn.khiops_classifier_multitable_list()

Trains a KhiopsClassifier using a list dataset specification

Warning

This dataset input method is deprecated and will be removed in Khiops 11.
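
The supported alternative is the dictionary specification shown in khiops_classifier_multitable_star above. As a sketch, applied to the dataframes prepared in the sample below, it would read:

# Hedged sketch: dictionary dataset spec equivalent to the deprecated list
# spec used below, reusing the dataframes prepared in this sample
X_train = {
    "main_table": "Accidents",
    "tables": {
        "Accidents": (X_train_main, "AccidentId"),
        "Vehicles": (X_train_secondary, ["AccidentId", "VehicleId"]),
    },
}
khc = KhiopsClassifier()  # no "key" parameter needed with the dict spec
khc.fit(X_train, y_train)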

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Load the root table of the dataset into a pandas dataframe
accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
accidents_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Accidents.txt"),
    sep="\t",
    encoding="latin1",
)

# Split the root dataframe into train and test
accidents_train_df, accidents_test_df = train_test_split(
    accidents_df, test_size=0.3, random_state=1
)

# Obtain the main X feature table and the y target vector ("Gravity" column)
y_train = accidents_train_df["Gravity"]
y_test = accidents_test_df["Gravity"]
X_train_main = accidents_train_df.drop("Gravity", axis=1)
X_test_main = accidents_test_df.drop("Gravity", axis=1)

# Load the secondary table of the dataset into a pandas dataframe
vehicles_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train_main["AccidentId"].to_frame()
X_test_ids = X_test_main["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId")

# Create the classifier specifying the key column name
khc = KhiopsClassifier(key="AccidentId")

# Train the classifier
khc.fit([X_train_main, X_train_secondary], y_train)

# Predict the class on the test dataset
y_test_pred = khc.predict([X_test_main, X_test_secondary])
print("Predicted classes (first 10):")
print(y_test_pred[:10])
print("---")

# Predict the class probability on the test dataset
y_test_probas = khc.predict_proba([X_test_main, X_test_secondary])
print("Predicted class probabilities (first 10):")
print(y_test_probas[:10])
print("---")

# Evaluate accuracy and auc metrics on the test dataset
test_accuracy = metrics.accuracy_score(y_test, y_test_pred)
test_auc = metrics.roc_auc_score(y_test, y_test_probas[:, 1])
print(f"Test accuracy = {test_accuracy}")
print(f"Test auc      = {test_auc}")
samples_sklearn.khiops_classifier_multitable_star_file()

Trains a KhiopsClassifier with a file path based dataset

Warning

This dataset input method is deprecated and will be removed in Khiops 11. If you need to handle large datasets that do not easily fit into memory, you may use the core API directly, which lets you specify file paths.
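
As a rough sketch of that alternative, assuming the Adult sample files (the Adult.kdic dictionary and Adult.txt data table) and the khiops.core.train_predictor entry point; see the core API samples for the exact options:

# Hedged sketch: training directly from files with the core API
# Multi-table file datasets are passed via the additional_data_tables
# option of train_predictor (see the core API documentation)
import os
from khiops import core as kh

adult_dir = os.path.join(kh.get_samples_dir(), "Adult")
kh.train_predictor(
    os.path.join(adult_dir, "Adult.kdic"),  # dictionary (schema) file
    "Adult",                                # dictionary name
    os.path.join(adult_dir, "Adult.txt"),   # data table file path
    "class",                                # target variable
    "kh_samples_core_results",              # results directory
)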

# Imports
import os
import pandas as pd
from khiops import core as kh
from khiops.sklearn import KhiopsClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Create the output directory
results_dir = os.path.join("kh_samples", "khiops_classifier_multitable_file")
os.makedirs(results_dir, exist_ok=True)

# Load the root table of the dataset into a pandas dataframe
accidents_dataset_path = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
accidents_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Accidents.txt"),
    sep="\t",
    encoding="latin1",
)

# Split the root dataframe into train and test
X_train_main, X_test_main = train_test_split(
    accidents_df, test_size=0.3, random_state=1
)

# Load the secondary table of the dataset into a pandas dataframe
vehicles_df = pd.read_csv(
    os.path.join(accidents_dataset_path, "Vehicles.txt"), sep="\t"
)

# Split the secondary dataframe with the keys of the split root dataframe
X_train_ids = X_train_main["AccidentId"].to_frame()
X_test_ids = X_test_main["AccidentId"].to_frame()
X_train_secondary = X_train_ids.merge(vehicles_df, on="AccidentId")
X_test_secondary = X_test_ids.merge(vehicles_df, on="AccidentId")

# Write the train and test dataset sets to disk
# For the test file we remove the target column from the main table
X_train_main_path = os.path.join(results_dir, "X_train_main.txt")
X_train_main.to_csv(X_train_main_path, sep="\t", header=True, index=False)
X_train_secondary_path = os.path.join(results_dir, "X_train_secondary.txt")
X_train_secondary.to_csv(X_train_secondary_path, sep="\t", header=True, index=False)
X_test_main_path = os.path.join(results_dir, "X_test_main.txt")
y_test = X_test_main.sort_values("AccidentId")["Gravity"]
X_test_main.drop(columns="Gravity").to_csv(
    X_test_main_path, sep="\t", header=True, index=False
)
X_test_secondary_path = os.path.join(results_dir, "X_test_secondary.txt")
X_test_secondary.to_csv(X_test_secondary_path, sep="\t", header=True, index=False)

# Define the train and test dataset specifications
# The "format" tuple gives the field separator and whether the files have a header
X_train_dataset = {
    "main_table": "Accidents",
    "tables": {
        "Accidents": (X_train_main_path, "AccidentId"),
        "Vehicles": (X_train_secondary_path, ["AccidentId", "VehicleId"]),
    },
    "format": ("\t", True),
}
X_test_dataset = {
    "main_table": "Accidents",
    "tables": {
        "Accidents": (X_test_main_path, "AccidentId"),
        "Vehicles": (X_test_secondary_path, ["AccidentId", "VehicleId"]),
    },
    "format": ("\t", True),
}

# Create the classifier and fit it
khc = KhiopsClassifier(output_dir=results_dir)
khc.fit(X_train_dataset, y="Gravity")

# Predict the classes on the test dataset; with a file-based dataset the
# predictions are written to a file whose path is returned
y_test_pred_path = khc.predict(X_test_dataset)
y_test_pred = pd.read_csv(y_test_pred_path, sep="\t")
print("Predicted classes (first 10):")
print(y_test_pred["PredictedGravity"].head(10))
print("---")

y_test_probas_path = khc.predict_proba(X_test_dataset)
y_test_probas = pd.read_csv(y_test_probas_path, sep="\t")
proba_columns = [col for col in y_test_probas if col.startswith("Prob")]
print("Predicted class probabilities (first 10):")
print(y_test_probas[proba_columns].head(10))
print("---")

# Evaluate accuracy and auc metrics on the test dataset
test_accuracy = metrics.accuracy_score(y_test, y_test_pred["PredictedGravity"])
test_auc = metrics.roc_auc_score(y_test, y_test_probas["ProbGravityLethal"])
print(f"Test accuracy = {test_accuracy}")
print(f"Test auc      = {test_auc}")