:orphan:

.. currentmodule:: samples

Samples core
============

The code snippets on this page demonstrate the basic use of the
:py:mod:`khiops.core` module.

Script and Jupyter notebook
---------------------------

The samples on this page are also available as:

- :download:`Python script <../../khiops/samples/samples.py>`
- :download:`Jupyter notebook <../../khiops/samples/samples.ipynb>`

Setup
-----

First make sure you have installed the sample datasets. In a configured conda shell
(e.g. *Anaconda Prompt* on Windows) execute:

.. code-block:: shell

    kh-download-datasets

If that doesn't work, open a Python console and execute:

.. code-block:: python

    from khiops.tools import download_datasets
    download_datasets()

Samples
-------

.. autofunction:: get_khiops_version
.. code-block:: python

    from khiops import core as kh

    print(f"Khiops version: {kh.get_khiops_version()}")

.. autofunction:: build_dictionary_from_data_table
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    dictionary_name = "AutoAdult"
    dictionary_file_path = os.path.join(
        "kh_samples", "build_dictionary_from_data_table", "AutoAdult.kdic"
    )

    # Create the dictionary from the data table
    kh.build_dictionary_from_data_table(
        data_table_path, dictionary_name, dictionary_file_path
    )

.. autofunction:: detect_data_table_format
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    results_dir = os.path.join("kh_samples", "detect_data_table_format")
    transformed_data_table_path = os.path.join(results_dir, "AdultWithAnotherFormat.txt")

    # Create the output directory
    if not os.path.isdir(results_dir):
        os.mkdir(results_dir)

    # Detect the format of the table
    format_spec = kh.detect_data_table_format(data_table_path)
    print("Format specification (header_line, field_separator)")
    print("Format detected on original table:", format_spec)

    # Make a deployment to change the format of the data table
    kh.deploy_model(
        dictionary_file_path,
        "Adult",
        data_table_path,
        transformed_data_table_path,
        output_header_line=False,
        output_field_separator=",",
    )

    # Detect the new format of the table without a dictionary file
    format_spec = kh.detect_data_table_format(transformed_data_table_path)
    print("Format detected on reformatted table:", format_spec)

    # Detect the new format of the table with a dictionary file
    format_spec = kh.detect_data_table_format(
        transformed_data_table_path,
        dictionary_file_path_or_domain=dictionary_file_path,
        dictionary_name="Adult",
    )
    print("Format detected (with dictionary file) on reformatted table:", format_spec)

.. autofunction:: check_database
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    log_file = os.path.join("kh_samples", "check_database", "check_database.log")

    # Check the database
    kh.check_database(
        dictionary_file_path,
        "Adult",
        data_table_path,
        log_file_path=log_file,
        max_messages=50,
    )
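
The check results are written to the file passed via ``log_file_path``. A minimal way
to inspect them, assuming the ``check_database`` sample above has just been run from
the same working directory:

.. code-block:: python

    # Print the messages logged by check_database (log_file is defined above)
    with open(log_file) as check_log:
        for line in check_log:
            print(line, end="")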

.. autofunction:: export_dictionary_files
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    results_dir = os.path.join("kh_samples", "export_dictionary_file")
    output_dictionary_file_path = os.path.join(results_dir, "ModifiedAdult.kdic")
    output_dictionary_json_path = os.path.join(results_dir, "ModifiedAdult.kdicj")
    alt_output_dictionary_json_path = os.path.join(results_dir, "AltModifiedAdult.kdicj")

    # Load the dictionary domain from the initial dictionary file
    # Then obtain the "Adult" dictionary within
    domain = kh.read_dictionary_file(dictionary_file_path)
    dictionary = domain.get_dictionary("Adult")

    # Set some of its variables to unused
    fnlwgt_variable = dictionary.get_variable("fnlwgt")
    fnlwgt_variable.used = False
    label_variable = dictionary.get_variable("Label")
    label_variable.used = False

    # Create the output directory if necessary
    if not os.path.exists("kh_samples"):
        os.mkdir("kh_samples")
        os.mkdir(results_dir)
    else:
        if not os.path.exists(results_dir):
            os.mkdir(results_dir)

    # Export to kdic
    domain.export_khiops_dictionary_file(output_dictionary_file_path)

    # Export to kdicj either from the domain or from a kdic file
    # Requires a Khiops execution, that's why it is not a method of DictionaryDomain
    kh.export_dictionary_as_json(domain, output_dictionary_json_path)
    kh.export_dictionary_as_json(
        output_dictionary_file_path, alt_output_dictionary_json_path
    )

.. autofunction:: train_predictor
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "train_predictor")

    # Train the predictor
    kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        results_dir,
        max_trees=0,
    )

.. autofunction:: train_predictor_file_paths
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "train_predictor_file_paths")

    # Train the predictor
    report_file_path, modeling_dictionary_file_path = kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        results_dir,
        max_trees=0,
    )
    print("Reports file available at " + report_file_path)
    print("Modeling dictionary file available at " + modeling_dictionary_file_path)

    # If you have Khiops Visualization installed you may open the report as follows
    # kh.visualize_report(report_file_path)
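
As an alternative to the visualization app, the training report can also be inspected
programmatically. A minimal sketch, assuming the ``train_predictor_file_paths`` sample
above has just been run so that ``report_file_path`` is defined:

.. code-block:: python

    # Load the training report and print the train AUC of the SNB predictor
    results = kh.read_analysis_results_file(report_file_path)
    print("SNB train AUC:", results.train_evaluation_report.get_snb_performance().auc)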

.. autofunction:: train_predictor_error_handling
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths with a nonexistent dictionary file
    dictionary_file_path = "NONEXISTENT_DICTIONARY_FILE.kdic"
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "train_predictor_error_handling")
    log_file_path = os.path.join(results_dir, "khiops.log")
    scenario_path = os.path.join(results_dir, "scenario._kh")

    # Train the predictor and handle the error
    try:
        kh.train_predictor(
            dictionary_file_path,
            "Adult",
            data_table_path,
            "class",
            results_dir,
            trace=True,
            log_file_path=log_file_path,
            output_scenario_path=scenario_path,
        )
    except kh.KhiopsRuntimeError as error:
        print("Khiops training failed! Below, the KhiopsRuntimeError message:")
        print(error)

    print("\nFull log contents:")
    print("------------------")
    with open(log_file_path) as log_file:
        for line in log_file:
            print(line, end="")

    print("\nExecuted scenario")
    print("-----------------")
    with open(scenario_path) as scenario_file:
        for line in scenario_file:
            print(line, end="")

.. autofunction:: train_predictor_mt
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
    dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
    accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
    vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
    results_dir = os.path.join("kh_samples", "train_predictor_mt")

    # Train the predictor. Besides the mandatory parameters, we specify:
    # - A python dictionary linking data paths to file paths for non-root tables
    # - To not construct any decision tree
    # The default number of automatic features is 100
    kh.train_predictor(
        dictionary_file_path,
        "Accident",
        accidents_table_path,
        "Gravity",
        results_dir,
        additional_data_tables={"Accident`Vehicles": vehicles_table_path},
        max_trees=0,
    )

.. autofunction:: train_predictor_mt_with_specific_rules
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
    dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
    accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
    vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
    results_dir = os.path.join("kh_samples", "train_predictor_mt_with_specific_rules")

    # Train the predictor. Besides the mandatory parameters, we specify:
    # - A python dictionary linking data paths to file paths for non-root tables
    # - The maximum number of aggregate variables to construct (1000)
    # - The construction rules allowed to automatically create aggregates
    # - To not construct any decision tree
    kh.train_predictor(
        dictionary_file_path,
        "Accident",
        accidents_table_path,
        "Gravity",
        results_dir,
        additional_data_tables={"Accident`Vehicles": vehicles_table_path},
        max_constructed_variables=1000,
        construction_rules=["TableMode", "TableSelection"],
        max_trees=0,
    )

.. autofunction:: train_predictor_mt_snowflake
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents")
    dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
    accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
    vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
    users_table_path = os.path.join(accidents_dir, "Users.txt")
    places_table_path = os.path.join(accidents_dir, "Places.txt")
    results_dir = os.path.join("kh_samples", "train_predictor_mt_snowflake")

    # Train the predictor. Besides the mandatory parameters, we specify:
    # - A python dictionary linking data paths to file paths for non-root tables
    # - To not construct any decision tree
    # The default number of automatic features is 100
    kh.train_predictor(
        dictionary_file_path,
        "Accident",
        accidents_table_path,
        "Gravity",
        results_dir,
        additional_data_tables={
            "Accident`Vehicles": vehicles_table_path,
            "Accident`Vehicles`Users": users_table_path,
            "Accident`Place": places_table_path,
        },
        max_trees=0,
    )
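
To see what the multi-table feature construction produced, the training report written
to ``results_dir`` can be loaded as in the other samples. A minimal sketch, assuming
the snowflake sample above has just been run and that the report has the default file
name ``AllReports.khj`` used elsewhere on this page:

.. code-block:: python

    # Load the training report and print how many variables were evaluated
    # (this count includes the automatically constructed aggregates)
    snowflake_report_path = os.path.join(results_dir, "AllReports.khj")
    results = kh.read_analysis_results_file(snowflake_report_path)
    print("Evaluated variables:", len(results.preparation_report.variables_statistics))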

.. autofunction:: train_predictor_with_train_percentage
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "train_predictor_with_train_percentage")

    # Train the predictor. Besides the mandatory parameters, we specify:
    # - A 90% sampling rate for the training dataset
    # - The test dataset as the complement of the training dataset (10%)
    # - No trees
    kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        results_dir,
        sample_percentage=90,
        use_complement_as_test=True,
        max_trees=0,
        results_prefix="P90_",
    )

.. autofunction:: train_predictor_with_trees
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Letter", "Letter.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Letter", "Letter.txt")
    results_dir = os.path.join("kh_samples", "train_predictor_with_trees")

    # Train the predictor with at most 15 trees (default 10)
    kh.train_predictor(
        dictionary_file_path,
        "Letter",
        data_table_path,
        "lettr",
        results_dir,
        sample_percentage=80,
        use_complement_as_test=True,
        results_prefix="P80_",
        max_trees=15,
    )

.. autofunction:: train_predictor_with_pairs
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "train_predictor_with_pairs")

    # Train the predictor with at most 10 pairs as follows:
    # - Include the pairs age-race and capital_gain-capital_loss
    # - Include all possible pairs having relationship as a component
    kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        results_dir,
        use_complement_as_test=True,
        max_trees=0,
        max_pairs=10,
        specific_pairs=[
            ("age", "race"),
            ("capital_gain", "capital_loss"),
            ("relationship", ""),
        ],
    )

.. autofunction:: train_predictor_with_multiple_parameters
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "train_predictor_with_multiple_parameters")
    output_script_path = os.path.join(results_dir, "output_scenario._kh")
    log_path = os.path.join(results_dir, "log.txt")

    # Train the predictor. Besides the mandatory parameters, we specify:
    # - The value "more" as the main target value
    # - The output Khiops script file location (generic)
    # - The log file location (generic)
    # - The maximum memory used, set to 1000 MB
    # - To show the debug trace (generic)
    kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        results_dir,
        main_target_value="more",
        output_scenario_path=output_script_path,
        log_file_path=log_path,
        memory_limit_mb=1000,
        trace=True,
    )
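
The generated scenario and log are plain text files, so they can be printed to see
exactly what was executed. A minimal sketch, assuming the sample above has completed
and written ``output_script_path``:

.. code-block:: python

    # Print the Khiops scenario generated by the training call above
    with open(output_script_path) as output_script_file:
        for line in output_script_file:
            print(line, end="")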

.. autofunction:: train_predictor_detect_format
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt")
    results_dir = os.path.join("kh_samples", "train_predictor_detect_format")
    transformed_data_table_path = os.path.join(results_dir, "TransformedIris.txt")

    # Transform the database format from header_line=True and field_separator=TAB
    # to header_line=False and field_separator=","
    # See the deploy_model examples below for more details
    kh.deploy_model(
        dictionary_file_path,
        "Iris",
        data_table_path,
        transformed_data_table_path,
        output_header_line=False,
        output_field_separator=",",
    )

    # Try to learn with the old format
    try:
        kh.train_predictor(
            dictionary_file_path,
            "Iris",
            transformed_data_table_path,
            "Class",
            results_dir,
            header_line=True,
            field_separator="",
        )
    except kh.KhiopsRuntimeError as error:
        print(
            "This failed because of a bad data table format spec. "
            + "Below, the KhiopsRuntimeError message:"
        )
        print(error)

    # Train without specifying the format (detect_format is True by default)
    kh.train_predictor(
        dictionary_file_path,
        "Iris",
        transformed_data_table_path,
        "Class",
        results_dir,
    )

.. autofunction:: train_predictor_with_cross_validation
.. code-block:: python

    # Imports
    import math
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "train_predictor_with_cross_validation")
    fold_dictionary_file_path = os.path.join(results_dir, "AdultWithFolding.kdic")

    # Create the output directory
    if not os.path.isdir(results_dir):
        os.mkdir(results_dir)

    # Load the learning dictionary object
    domain = kh.read_dictionary_file(dictionary_file_path)
    dictionary = domain.get_dictionary("Adult")

    # Add a random fold index variable to the learning dictionary
    fold_number = 5
    fold_index_variable = kh.Variable()
    fold_index_variable.name = "FoldIndex"
    fold_index_variable.type = "Numerical"
    fold_index_variable.used = False
    fold_index_variable.rule = "Ceil(Product(" + str(fold_number) + ", Random()))"
    dictionary.add_variable(fold_index_variable)

    # Add variables that indicate if the instance is in the train dataset:
    for fold_index in range(1, fold_number + 1):
        is_in_train_dataset_variable = kh.Variable()
        is_in_train_dataset_variable.name = "IsInTrainDataset" + str(fold_index)
        is_in_train_dataset_variable.type = "Numerical"
        is_in_train_dataset_variable.used = False
        is_in_train_dataset_variable.rule = "NEQ(FoldIndex, " + str(fold_index) + ")"
        dictionary.add_variable(is_in_train_dataset_variable)

    # Print dictionary with fold variables
    print("Dictionary file with fold variables")
    domain.export_khiops_dictionary_file(fold_dictionary_file_path)
    with open(fold_dictionary_file_path) as fold_dictionary_file:
        for line in fold_dictionary_file:
            print(line, end="")

    # For each fold k:
    print("Training Adult with " + str(fold_number) + " folds")
    print("\tfold\ttrain auc\ttest auc")
    train_aucs = []
    test_aucs = []
    for fold_index in range(1, fold_number + 1):
        # Train a model from the sub-dataset where IsInTrainDataset is 1
        train_reports_path, modeling_dictionary_file_path = kh.train_predictor(
            domain,
            "Adult",
            data_table_path,
            "class",
            results_dir,
            sample_percentage=100,
            selection_variable="IsInTrainDataset" + str(fold_index),
            selection_value=1,
            max_trees=0,
            results_prefix="Fold" + str(fold_index),
        )

        # Evaluate the resulting model in the subsets where IsInTrainDataset is 0
        test_evaluation_report_path = kh.evaluate_predictor(
            modeling_dictionary_file_path,
            "Adult",
            data_table_path,
            results_dir,
            sample_percentage=100,
            selection_variable="IsInTrainDataset" + str(fold_index),
            selection_value=0,
            results_prefix="Fold" + str(fold_index),
        )

        # Obtain the train AUC from the train report and the test AUC from the
        # evaluation report and print them
        train_results = kh.read_analysis_results_file(train_reports_path)
        test_evaluation_results = kh.read_analysis_results_file(
            test_evaluation_report_path
        )
        train_auc = train_results.train_evaluation_report.get_snb_performance().auc
        test_auc = test_evaluation_results.evaluation_report.get_snb_performance().auc
        print("\t" + str(fold_index) + "\t" + str(train_auc) + "\t" + str(test_auc))

        # Store the train and test AUCs in arrays
        train_aucs.append(train_auc)
        test_aucs.append(test_auc)

    # Print the mean +- error aucs for both train and test
    mean_train_auc = sum(train_aucs) / fold_number
    squared_error_train_aucs = [(auc - mean_train_auc) ** 2 for auc in train_aucs]
    sd_train_auc = math.sqrt(sum(squared_error_train_aucs) / (fold_number - 1))
    mean_test_auc = sum(test_aucs) / fold_number
    squared_error_test_aucs = [(auc - mean_test_auc) ** 2 for auc in test_aucs]
    sd_test_auc = math.sqrt(sum(squared_error_test_aucs) / (fold_number - 1))
    print("final auc")
    print("train auc: " + str(mean_train_auc) + " +- " + str(sd_train_auc))
    print("test auc: " + str(mean_test_auc) + " +- " + str(sd_test_auc))
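
The mean and standard deviation above can also be computed with the standard library.
A small equivalent sketch, assuming ``train_aucs`` and ``test_aucs`` have been filled
as in the sample:

.. code-block:: python

    # Same summary using the statistics module (stdev uses the n - 1 denominator)
    import statistics

    print("train auc:", statistics.mean(train_aucs), "+-", statistics.stdev(train_aucs))
    print("test auc:", statistics.mean(test_aucs), "+-", statistics.stdev(test_aucs))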

.. autofunction:: multiple_train_predictor
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    def display_test_results(json_result_file_path):
        """Display some of the training results"""
        results = kh.read_analysis_results_file(json_result_file_path)
        train_performance = results.train_evaluation_report.get_snb_performance()
        test_performance = results.test_evaluation_report.get_snb_performance()
        print(
            "\t"
            + str(len(results.preparation_report.variables_statistics))
            + "\t"
            + str(train_performance.auc)
            + "\t"
            + str(test_performance.auc)
        )

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "multiple_train_predictor")

    # Read the dictionary file to obtain an instance of class Dictionary
    dictionary_domain = kh.read_dictionary_file(dictionary_file_path)
    dictionary = dictionary_domain.get_dictionary("Adult")

    # Train an SNB model using all the variables
    print("\t#vars\ttrain auc\ttest auc")
    json_result_file_path, _ = kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        results_dir,
        sample_percentage=70,
        use_complement_as_test=True,
        max_trees=0,
    )
    display_test_results(json_result_file_path)

    # Read the results to obtain the variables sorted by decreasing Level
    analysis_results = kh.read_analysis_results_file(json_result_file_path)
    preparation_results = analysis_results.preparation_report

    # Train a sequence of models with a decreasing number of variables
    # We disable variables one-by-one in increasing level (predictive power) order
    variable_number = len(preparation_results.variables_statistics)
    for i in reversed(range(variable_number)):
        # Search the next variable
        variable = preparation_results.variables_statistics[i]

        # Disable this variable in the dictionary domain object
        dictionary.get_variable(variable.name).used = False

        # Train the model with this dictionary domain object
        prefix = f"V{variable_number - 1 - i}_"
        json_result_file_path, _ = kh.train_predictor(
            dictionary_domain,
            "Adult",
            data_table_path,
            "class",
            results_dir,
            sample_percentage=70,
            use_complement_as_test=True,
            results_prefix=prefix,
            max_trees=0,
        )

        # Show a preview of the results
        display_test_results(json_result_file_path)

.. autofunction:: evaluate_predictor
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "evaluate_predictor")
    model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic")

    # Train the predictor
    kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        results_dir,
        max_trees=0,
    )

    # Evaluate the predictor
    report_file_path = kh.evaluate_predictor(
        model_dictionary_file_path, "Adult", data_table_path, results_dir
    )
    print("Evaluation report available at " + report_file_path)

.. autofunction:: access_predictor_evaluation_report
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "access_predictor_evaluation_report")
    evaluation_report_path = os.path.join(results_dir, "AllReports.khj")

    # Train the SNB predictor and some univariate predictors
    # Note: Evaluation in test is 30% by default
    kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        results_dir,
        max_trees=0,
        univariate_predictor_number=4,
    )

    # Obtain the evaluation results
    results = kh.read_analysis_results_file(evaluation_report_path)
    evaluation_report = results.test_evaluation_report
    snb_performance = evaluation_report.get_snb_performance()

    # Print univariate metrics for the SNB
    print("\nperformance metrics for " + snb_performance.name)
    for metric_name in snb_performance.get_metric_names():
        print(metric_name + ": " + str(snb_performance.get_metric(metric_name)))

    # Print the confusion matrix
    print("\nconfusion matrix:")
    confusion_matrix = snb_performance.confusion_matrix
    for target_value in confusion_matrix.values:
        print("\t" + target_value, end="")
    print("")
    for i, target_value in enumerate(confusion_matrix.values):
        observed_frequencies = confusion_matrix.matrix[i]
        print(target_value, end="")
        for frequency in observed_frequencies:
            print("\t" + str(frequency), end="")
        print("")

    # Print the head of the lift curves for the 'more' modality
    print("\nfirst five values of the lift curves for 'more'")
    snb_lift_curve = evaluation_report.get_snb_lift_curve("more")
    optimal_lift_curve = evaluation_report.get_classifier_lift_curve("Optimal", "more")
    random_lift_curve = evaluation_report.get_classifier_lift_curve("Random", "more")
    for i in range(5):
        print(
            str(snb_lift_curve.values[i])
            + "\t"
            + str(optimal_lift_curve.values[i])
            + "\t"
            + str(random_lift_curve.values[i])
        )

    # Print univariate metrics for a univariate predictor
    predictor_performance = evaluation_report.get_predictor_performance(
        "Univariate relationship"
    )
    print("\n\nperformance metrics for " + predictor_performance.name)
    for metric_name in predictor_performance.get_metric_names():
        print(metric_name + ": " + str(predictor_performance.get_metric(metric_name)))
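
The same accessors make it easy to compare the SNB with the univariate predictor side
by side. A minimal sketch, assuming the sample above has just been run:

.. code-block:: python

    # Print the metrics that the SNB and the univariate predictor have in common
    common_metrics = set(snb_performance.get_metric_names()) & set(
        predictor_performance.get_metric_names()
    )
    for metric_name in sorted(common_metrics):
        print(
            metric_name
            + ": SNB=" + str(snb_performance.get_metric(metric_name))
            + ", univariate=" + str(predictor_performance.get_metric(metric_name))
        )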

.. autofunction:: train_recoder
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "train_recoder")

    # Train the recoder model
    kh.train_recoder(dictionary_file_path, "Adult", data_table_path, "class", results_dir)

.. autofunction:: train_recoder_with_multiple_parameters
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "train_recoder_with_multiple_parameters")

    # Train the recoder model
    kh.train_recoder(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        results_dir,
        max_pairs=10,
        categorical_recoding_method="part label",
        numerical_recoding_method="part label",
    )

.. autofunction:: train_recoder_mt_flatten
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
    dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
    accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
    vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
    results_dir = os.path.join("kh_samples", "train_recoder_mt_flatten")

    # Train the recoder. Besides the mandatory parameters, we specify:
    # - A python dictionary linking data paths to file paths for non-root tables
    # - The maximum number of aggregate variables to construct (1000)
    # - To keep all the created variables regardless of their informativeness (level)
    # - To not recode the variable values
    kh.train_recoder(
        dictionary_file_path,
        "Accident",
        accidents_table_path,
        "Gravity",
        results_dir,
        additional_data_tables={"Accident`Vehicles": vehicles_table_path},
        max_constructed_variables=1000,
        informative_variables_only=False,
        categorical_recoding_method="none",
        numerical_recoding_method="none",
        keep_initial_categorical_variables=True,
        keep_initial_numerical_variables=True,
    )

.. autofunction:: deploy_model
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "deploy_model")
    model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic")
    output_data_table_path = os.path.join(results_dir, "ScoresAdult.txt")

    # Train the predictor
    kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        results_dir,
        max_trees=0,
    )

    # Deploy the model on the database
    # It will score it according to the trained predictor
    kh.deploy_model(
        model_dictionary_file_path, "SNB_Adult", data_table_path, output_data_table_path
    )
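
The deployed table is a plain tab-separated file, so its first lines can be printed
directly. A minimal sketch, assuming the ``deploy_model`` sample above has just been
run and produced ``output_data_table_path``:

.. code-block:: python

    # Print the first lines of the deployed scores table
    with open(output_data_table_path) as scores_file:
        for _ in range(5):
            print(scores_file.readline(), end="")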

.. autofunction:: deploy_model_mt
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
    dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
    accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
    vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
    results_dir = os.path.join("kh_samples", "deploy_model_mt")
    model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic")
    output_data_table_path = os.path.join(results_dir, "TransferredAccidents.txt")

    # Train the predictor (see train_predictor_mt for details)
    kh.train_predictor(
        dictionary_file_path,
        "Accident",
        accidents_table_path,
        "Gravity",
        results_dir,
        additional_data_tables={"Accident`Vehicles": vehicles_table_path},
        max_trees=0,
    )

    # Deploy the model on the database
    # Besides the mandatory parameters, we specify:
    # - A python dictionary linking data paths to file paths for non-root tables
    kh.deploy_model(
        model_dictionary_file_path,
        "SNB_Accident",
        accidents_table_path,
        output_data_table_path,
        additional_data_tables={"SNB_Accident`Vehicles": vehicles_table_path},
    )

.. autofunction:: deploy_model_mt_snowflake
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    accidents_dir = os.path.join(kh.get_samples_dir(), "Accidents")
    dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
    accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
    vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
    users_table_path = os.path.join(accidents_dir, "Users.txt")
    places_table_path = os.path.join(accidents_dir, "Places.txt")
    results_dir = os.path.join("kh_samples", "deploy_model_mt_snowflake")
    model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic")
    output_data_table_path = os.path.join(results_dir, "TransferredAccidents.txt")

    # Train the predictor. Besides the mandatory parameters, we specify:
    # - A python dictionary linking data paths to file paths for non-root tables
    # - To not construct any decision tree
    # The default number of automatic features is 100
    kh.train_predictor(
        dictionary_file_path,
        "Accident",
        accidents_table_path,
        "Gravity",
        results_dir,
        additional_data_tables={
            "Accident`Vehicles": vehicles_table_path,
            "Accident`Vehicles`Users": users_table_path,
            "Accident`Place": places_table_path,
        },
        max_trees=0,
    )

    # Deploy the model on the database
    # Besides the mandatory parameters, we specify:
    # - A python dictionary linking data paths to file paths for non-root tables
    kh.deploy_model(
        model_dictionary_file_path,
        "SNB_Accident",
        accidents_table_path,
        output_data_table_path,
        additional_data_tables={
            "SNB_Accident`Vehicles": vehicles_table_path,
            "SNB_Accident`Vehicles`Users": users_table_path,
            "SNB_Accident`Place": places_table_path,
        },
    )

.. autofunction:: deploy_model_expert
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "deploy_model_expert")
    model_dictionary_file_path = os.path.join(results_dir, "Modeling.kdic")
    output_data_table_path = os.path.join(results_dir, "ScoresAdult.txt")

    # Train the predictor
    kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        results_dir,
        max_trees=0,
    )

    # Read the model dictionary file to obtain an instance of class Dictionary
    model_domain = kh.read_dictionary_file(model_dictionary_file_path)
    snb_dictionary = model_domain.get_dictionary("SNB_Adult")

    # Select Label (identifier)
    snb_dictionary.get_variable("Label").used = True

    # Select the variables containing the probabilities for each class
    for variable in snb_dictionary.variables:
        # The variable must have a metadata key that starts with "TargetProb"
        for key in variable.meta_data.keys:
            if key.startswith("TargetProb"):
                variable.used = True

    # Deploy the model. Besides the mandatory parameters, we specify:
    # - A DictionaryDomain object to use instead of the mandatory dictionary file
    kh.deploy_model(model_domain, "SNB_Adult", data_table_path, output_data_table_path)

.. autofunction:: deploy_classifier_for_metrics
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "deploy_classifier_for_metrics")
    output_data_table_path = os.path.join(results_dir, "ScoresAdult.txt")

    # Train the classifier for the target "class"
    _, modeling_dictionary_file_path = kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        results_dir,
        max_trees=0,
    )

    # Obtain the scores of the SNB on the test dataset to calculate the PR curve
    kh.deploy_predictor_for_metrics(
        modeling_dictionary_file_path,
        "SNB_Adult",
        data_table_path,
        output_data_table_path,
        sampling_mode="Exclude sample",
        output_header_line=False,
    )

    # Estimate the precision and recall for the class "more" at increasing thresholds
    # Note: Normally one would do this with a package (e.g. sklearn.metrics)
    thresholds = [0.1, 0.3, 0.5, 0.7, 0.9]
    true_positives = {thres: 0 for thres in thresholds}
    false_positives = {thres: 0 for thres in thresholds}
    false_negatives = {thres: 0 for thres in thresholds}
    with open(output_data_table_path) as output_data_table:
        for line in output_data_table:
            fields = line.split("\t")
            true_target = fields[0]
            proba_more = float(fields[3])
            for thres in thresholds:
                if true_target == "more" and proba_more >= thres:
                    true_positives[thres] += 1
                elif true_target == "more" and proba_more < thres:
                    false_negatives[thres] += 1
                elif true_target == "less" and proba_more >= thres:
                    false_positives[thres] += 1
    precision = {
        thres: true_positives[thres] / (true_positives[thres] + false_positives[thres])
        for thres in thresholds
    }
    recall = {
        thres: true_positives[thres] / (true_positives[thres] + false_negatives[thres])
        for thres in thresholds
    }

    # Print the curve at the selected points
    print("Precision and Recall for class 'more'")
    print("threshold\trecall\tprecision")
    thresholds.reverse()
    for thres in thresholds:
        print(str(thres) + "\t" + str(recall[thres]) + "\t" + str(precision[thres]))
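
The same computation can be delegated to scikit-learn, if it is installed, by reading
the deployed scores file again (the field indices are those used in the sample above):

.. code-block:: python

    # Compute the full precision-recall curve with scikit-learn (optional dependency)
    from sklearn.metrics import precision_recall_curve

    y_true = []
    y_score = []
    with open(output_data_table_path) as output_data_table:
        for line in output_data_table:
            fields = line.split("\t")
            y_true.append(1 if fields[0] == "more" else 0)
            y_score.append(float(fields[3]))
    precisions, recalls, pr_thresholds = precision_recall_curve(y_true, y_score)
    print("Number of points in the precision-recall curve:", len(pr_thresholds))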

.. autofunction:: deploy_regressor_for_metrics
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "deploy_regressor_for_metrics")
    output_data_table_path = os.path.join(results_dir, "TrueAndPredictedAges.txt")

    # Train the regressor for the target "age" (with 20% train to be quick)
    _, modeling_dictionary_file_path = kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "age",
        results_dir,
        sample_percentage=20,
        max_trees=0,
    )

    # Obtain the predicted values of the SNB on the test dataset to estimate R2
    kh.deploy_predictor_for_metrics(
        modeling_dictionary_file_path,
        "SNB_Adult",
        data_table_path,
        output_data_table_path,
        sample_percentage=20,
        sampling_mode="Exclude sample",
        output_header_line=False,
    )

    # Estimate R2
    # Note: Normally one would do this with a package (e.g. sklearn.metrics)
    # First pass to estimate the sum of squared residuals and the mean
    ss_res = 0
    mean = 0
    n_instances = 0
    with open(output_data_table_path) as output_data_table:
        for line in output_data_table:
            fields = line.split("\t")
            true_target = float(fields[0])
            predicted_target = float(fields[1])
            ss_res += (true_target - predicted_target) ** 2
            mean += true_target
            n_instances += 1
    mean /= n_instances

    # Second pass to estimate the total sum of squares and finish the R2 estimation
    ss_tot = 0
    with open(output_data_table_path) as output_data_table:
        for line in output_data_table:
            fields = line.split("\t")
            true_target = float(fields[0])
            ss_tot += (true_target - mean) ** 2
    r2_score = 1 - ss_res / ss_tot

    # Print the results
    print("Adult 'age' regression (20% train)")
    print(f"R2 (explained variance) = {r2_score}")

.. autofunction:: sort_data_table
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
    dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
    accidents_table_path = os.path.join(accidents_dir, "Accidents.txt")
    output_data_table_path = os.path.join(
        "kh_samples",
        "sort_data_table",
        "SortedAccidents.txt",
    )

    # Sort the table
    kh.sort_data_table(
        dictionary_file_path, "Accident", accidents_table_path, output_data_table_path
    )

.. autofunction:: sort_data_table_expert
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    accidents_dir = os.path.join(kh.get_samples_dir(), "AccidentsSummary")
    dictionary_file_path = os.path.join(accidents_dir, "Accidents.kdic")
    vehicles_table_path = os.path.join(accidents_dir, "Vehicles.txt")
    output_data_table_path = os.path.join(
        "kh_samples", "sort_data_table_expert", "SortedVehicles.txt"
    )

    # Sort the table. Besides the mandatory parameters, we specify:
    # - A list containing the sorting fields
    kh.sort_data_table(
        dictionary_file_path,
        "Vehicle",
        vehicles_table_path,
        output_data_table_path,
        sort_variables=["AccidentId", "VehicleId"],
    )

.. autofunction:: extract_keys_from_data_table
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
    dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
    data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")
    output_data_table_path = os.path.join(
        "kh_samples",
        "extract_keys_from_data_table",
        "KeysSpliceJunction.txt",
    )

    # Extract keys from table "SpliceJunctionDNA" to the output table
    kh.extract_keys_from_data_table(
        dictionary_file_path,
        "SpliceJunctionDNA",
        data_table_path,
        output_data_table_path,
    )

.. autofunction:: train_coclustering
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
    dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
    data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")
    results_dir = os.path.join("kh_samples", "train_coclustering")

    # Train a coclustering model for variables "SampleId" and "Char"
    coclustering_report_path = kh.train_coclustering(
        dictionary_file_path,
        "SpliceJunctionDNA",
        data_table_path,
        ["SampleId", "Char"],
        results_dir,
    )
    print(f"Coclustering report file available at {coclustering_report_path}")

    # If you have Khiops Co-Visualization installed you may open the report as follows
    # kh.visualize_report(coclustering_report_path)

.. autofunction:: simplify_coclustering
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
    dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
    data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")
    results_dir = os.path.join("kh_samples", "simplify_coclustering")
    coclustering_file_path = os.path.join(results_dir, "Coclustering.khc")
    simplified_coclustering_file_name = "simplified_coclustering.khc"

    # Train a coclustering model for variables "SampleId" and "Char"
    kh.train_coclustering(
        dictionary_file_path,
        "SpliceJunctionDNA",
        data_table_path,
        ["SampleId", "Char"],
        results_dir,
    )

    # Simplify the trained coclustering with the constraints:
    # - maximum information preserved: 80%
    # - maximum total number of parts: 4
    kh.simplify_coclustering(
        coclustering_file_path,
        simplified_coclustering_file_name,
        results_dir,
        max_preserved_information=80,
        max_total_parts=4,
    )

.. autofunction:: extract_clusters
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
    dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
    data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")
    results_dir = os.path.join("kh_samples", "extract_clusters")
    coclustering_file_path = os.path.join(results_dir, "Coclustering.khc")
    clusters_file_path = os.path.join(results_dir, "extracted_clusters.txt")

    # Train a coclustering model for variables "SampleId" and "Char"
    kh.train_coclustering(
        dictionary_file_path,
        "SpliceJunctionDNA",
        data_table_path,
        ["SampleId", "Char"],
        results_dir,
    )

    # Extract the clusters of the "Char" dimension
    kh.extract_clusters(coclustering_file_path, "Char", clusters_file_path)
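
The extracted clusters file is a plain text file, so a quick look at its first lines
shows the cluster structure. A minimal sketch, assuming the ``extract_clusters`` sample
above has just been run:

.. code-block:: python

    # Print the first lines of the extracted clusters file
    with open(clusters_file_path) as clusters_file:
        for _ in range(10):
            print(clusters_file.readline(), end="")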

.. autofunction:: deploy_coclustering
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the initial file paths
    splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
    data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")
    dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
    results_dir = os.path.join("kh_samples", "deploy_coclustering")
    coclustering_file_path = os.path.join(results_dir, "Coclustering.khc")

    # Train a coclustering model for variables "SampleId" and "Char"
    kh.train_coclustering(
        dictionary_file_path,
        "SpliceJunctionDNA",
        data_table_path,
        ["SampleId", "Char"],
        results_dir,
    )

    # Deploy "Char" clusters in the training database
    kh.deploy_coclustering(
        dictionary_file_path,
        "SpliceJunctionDNA",
        data_table_path,
        coclustering_file_path,
        ["SampleId"],
        "Char",
        results_dir,
        header_line=True,
    )

.. autofunction:: deploy_coclustering_expert
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the initial file paths
    splice_dir = os.path.join(kh.get_samples_dir(), "SpliceJunction")
    dictionary_file_path = os.path.join(splice_dir, "SpliceJunction.kdic")
    data_table_path = os.path.join(splice_dir, "SpliceJunction.txt")
    secondary_data_table_path = os.path.join(splice_dir, "SpliceJunctionDNA.txt")
    results_dir = os.path.join("kh_samples", "deploy_coclustering_expert")
    coclustering_file_path = os.path.join(results_dir, "Coclustering.khc")

    # Train a coclustering model for variables "SampleId" and "Char"
    print("train coclustering on SpliceJunctionDNA")
    kh.train_coclustering(
        dictionary_file_path,
        "SpliceJunctionDNA",
        secondary_data_table_path,
        ["SampleId", "Char"],
        results_dir,
    )

    print("prepare_coclustering_deployment")
    # The input dictionary is extended with new coclustering based variables
    kh.prepare_coclustering_deployment(
        dictionary_file_path,
        "SpliceJunction",
        coclustering_file_path,
        "DNA",
        "SampleId",
        results_dir,
    )
    augmented_dictionary_file_path = os.path.join(results_dir, "Coclustering.kdic")

    print("prepare_coclustering_deployment with at most two clusters")
    # Extend the already extended dictionary with the new variables from a simplified CC
    kh.prepare_coclustering_deployment(
        augmented_dictionary_file_path,
        "SpliceJunction",
        coclustering_file_path,
        "DNA",
        "SampleId",
        results_dir,
        results_prefix="Reaugmented",
        variables_prefix="C2_",
        max_part_numbers={"SampleId": 2},
    )
    reaugmented_dictionary_file_path = os.path.join(
        results_dir, "ReaugmentedCoclustering.kdic"
    )
    output_data_table_path = os.path.join(results_dir, "TransferredSpliceJunction.txt")

    # Deploy the coclustering with the extended dictionary
    print("deploy_model with the new coclustering based variables")
    kh.deploy_model(
        reaugmented_dictionary_file_path,
        "SpliceJunction",
        data_table_path,
        output_data_table_path,
        additional_data_tables={"SpliceJunction`DNA": secondary_data_table_path},
    )

    deployed_dictionary_file_path = os.path.join(
        results_dir, "Transferred_Coclustering.kdic"
    )
    print("build_deployed_dictionary to get the new dictionary")
    kh.build_deployed_dictionary(
        reaugmented_dictionary_file_path,
        "SpliceJunction",
        deployed_dictionary_file_path,
    )

.. autofunction:: scenario_prologue
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Adult", "Adult.txt")
    results_dir = os.path.join("kh_samples", "scenario_prologue")

    # Set the maximum memory "by hand" with a scenario prologue
    scenario_prologue = """
        // Max memory 2000 mb
        AnalysisSpec.SystemParameters.MemoryLimit 2000
    """

    # Train the predictor
    kh.train_predictor(
        dictionary_file_path,
        "Adult",
        data_table_path,
        "class",
        results_dir,
        max_trees=0,
        scenario_prologue=scenario_prologue,
    )

.. autofunction:: build_deployed_dictionary
.. code-block:: python

    # Imports
    import os
    from khiops import core as kh

    # Set the file paths
    dictionary_file_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.kdic")
    data_table_path = os.path.join(kh.get_samples_dir(), "Iris", "Iris.txt")
    results_dir = os.path.join("kh_samples", "build_deployed_dictionary")
    deployed_dictionary_file_path = os.path.join(results_dir, "SNB_Iris_deployed.kdic")

    # Train the predictor
    _, modeling_dictionary_file_path = kh.train_predictor(
        dictionary_file_path,
        "Iris",
        data_table_path,
        "Class",
        results_dir,
        max_trees=0,
    )

    # Build the dictionary to read the output of the predictor dictionary file
    # It will contain the columns of the table generated by deploying the model
    kh.build_deployed_dictionary(
        modeling_dictionary_file_path,
        "SNB_Iris",
        deployed_dictionary_file_path,
    )

    # Print the deployed dictionary
    with open(deployed_dictionary_file_path) as deployed_dictionary_file:
        for line in deployed_dictionary_file:
            print(line, end="")