Coverage for mindsdb / integrations / handlers / autokeras_handler / autokeras_handler.py: 0%
71 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-21 00:36 +0000
1import os
2import random
3import string
4from typing import Optional
6import numpy as np
7import pandas as pd
8import autokeras as ak
9from sklearn import preprocessing
10from tensorflow.keras.models import load_model
12from mindsdb.integrations.libs.base import BaseMLEngine
# Makes this run on systems where this arg isn't specified, like Windows Subsystem for Linux
# Doesn't break things on Mac
os.environ["XLA_FLAGS"] = "--xla_gpu_cuda_data_dir=/usr/lib/cuda"

# Baseline number of AutoKeras search trials; create() scales this by the
# optional "train_time" argument when one is supplied.
DEFAULT_TRIALS = 100
def train_model(df, target, max_trials=DEFAULT_TRIALS):
    """Train an AutoKeras model on the input dataframe.

    Chooses regression vs. classification automatically from the target
    column's dtype, one-hot encodes categorical feature columns, and returns
    the exported best model together with the names of the generated dummy
    columns (the prediction path needs them to realign its feature matrix).
    """
    # Numeric target -> regression; anything else -> classification.
    if np.issubdtype(df[target].dtype, np.number):
        head = ak.RegressionHead()
        y_train = df[target].to_numpy()
    else:
        head = ak.ClassificationHead()
        # Binarize the class labels for the classification head.
        y_train = preprocessing.LabelBinarizer().fit_transform(df[target])

    features = df.drop(target, axis=1)
    trainer = ak.AutoModel(
        inputs=ak.Input(),
        outputs=head,
        overwrite=True,
        max_trials=max_trials
    )

    # Remember which columns were numeric before one-hot encoding so the
    # freshly created dummy columns can be identified afterwards.
    numeric_cols = features.select_dtypes(include=[np.number]).columns.values.tolist()
    features = pd.get_dummies(features)
    dummy_cols = [col for col in features.columns.values.tolist() if col not in numeric_cols]

    trainer.fit(features.to_numpy(), y_train, verbose=2)
    return trainer.export_model(), dummy_cols
def get_preds_from_model(df, model, target, column_count, categorical_dummy_column_names):
    """Run the stored AutoKeras model over *df* and return its raw predictions.

    Drops the mindsdb row-id and the target column when present, validates
    that every feature column was supplied, one-hot encodes categoricals, and
    zero-fills any dummy column the training data produced but this input lacks.
    """
    features = df.copy()
    # The row id and the target are not model features; strip them if present.
    for extra in ("__mindsdb_row_id", target):
        if extra in features.columns.values.tolist():
            features = features.drop(extra, axis=1)

    if len(features.columns) != column_count:
        raise Exception("All feature columns must be defined in the WHERE clause when making predictions")

    # One-hot encode, then add any training-time dummy column that did not
    # appear in this particular input (all zeros) so the matrix shape matches.
    encoded = pd.get_dummies(features)
    for dummy in categorical_dummy_column_names:
        if dummy not in encoded.columns.values.tolist():
            encoded[dummy] = 0

    return model.predict(encoded, verbose=2)
def format_categorical_preds(predictions, original_y, df_to_predict, target_col):
    """Map raw classifier outputs back to their original class labels.

    Categorical predictions come out of the AutoKeras model in binarized
    form, e.g. (0, 1, 0). This re-fits a LabelBinarizer on the original
    target column to invert that encoding, writes the decoded labels into
    ``df_to_predict[target_col]``, and adds a "confidence" column holding
    each row's highest class score.

    Args:
        predictions: raw model output, one row of class scores per input row.
        original_y: the training target column (pandas Series) used to
            reconstruct the label encoding.
        df_to_predict: dataframe the decoded predictions are written into
            (mutated in place and also returned).
        target_col: name of the target column to populate.

    Returns:
        df_to_predict with the target and "confidence" columns filled in.
    """
    # Re-create the binarizer fitted at training time to invert the encoding.
    lb = preprocessing.LabelBinarizer()
    lb.fit(original_y)
    preds = lb.inverse_transform(predictions)

    # Build the Series on df_to_predict's own index: a bare pd.Series(preds)
    # carries a fresh RangeIndex and pandas aligns on index during column
    # assignment, which silently produced NaNs whenever the caller's
    # dataframe was not 0..n-1 indexed.
    df_to_predict[target_col] = pd.Series(preds, index=df_to_predict.index).astype(original_y.dtype)
    # Confidence is simply the top class score per row (enumerate was unused).
    df_to_predict["confidence"] = [max(row) for row in predictions]
    return df_to_predict
class AutokerasHandler(BaseMLEngine):
    """
    Integration with the AutoKeras ML library.
    """ # noqa

    name = "autokeras"

    def create(self, target: str, df: Optional[pd.DataFrame] = None, args: Optional[dict] = None) -> None:
        """Create and tune an AutoKeras model from the input df.

        Persists the training data, the saved-model path, and the dummy
        column names to model storage so predict() can reload everything.
        """
        # Only the USING clause matters here; drop the rest of the problem definition.
        args = args["using"]
        args["target"] = target
        if "train_time" in args:
            max_trials = int(args["train_time"] * DEFAULT_TRIALS)
        else:
            max_trials = DEFAULT_TRIALS

        # Keep the training df so predict() can decode categorical targets later.
        args["training_df"] = df.to_json()
        args["training_data_column_count"] = len(df.columns) - 1  # subtract 1 for target

        # Unique on-disk location for the exported Keras model.
        suffix = "".join(random.choices(string.ascii_uppercase + string.digits, k=24))
        args["folder_path"] = os.path.join("autokeras", f"{suffix}.keras")
        os.makedirs(os.path.dirname(args["folder_path"]), exist_ok=True)

        model, args["data_column_names"] = train_model(df, target, max_trials)
        model.save(args["folder_path"])
        self.model_storage.json_set("predict_args", args)

    def predict(self, df, args=None):
        """Predict with the best saved AutoKeras model."""
        args = self.model_storage.json_get("predict_args")
        training_df = pd.read_json(args["training_df"])
        model = load_model(args["folder_path"], custom_objects=ak.CUSTOM_OBJECTS)

        result = df.copy()
        predictions = get_preds_from_model(
            result,
            model,
            args["target"],
            args["training_data_column_count"],
            args["data_column_names"],
        )

        # Numeric targets come straight from the regressor; categorical ones
        # must be decoded back to their original labels first.
        original_y = training_df[args["target"]]
        if np.issubdtype(original_y.dtype, np.number):
            result[args["target"]] = predictions
            return result
        return format_categorical_preds(predictions, original_y, result, args["target"])