Coverage for mindsdb / integrations / handlers / autokeras_handler / autokeras_handler.py: 0%

71 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-21 00:36 +0000

1import os 

2import random 

3import string 

4from typing import Optional 

5 

6import numpy as np 

7import pandas as pd 

8import autokeras as ak 

9from sklearn import preprocessing 

10from tensorflow.keras.models import load_model 

11 

12from mindsdb.integrations.libs.base import BaseMLEngine 

13 

14 

# Some systems (Windows Subsystem for Linux, for example) do not specify this
# XLA argument, so it is provided here. It does not break anything on Mac.
os.environ.update({"XLA_FLAGS": "--xla_gpu_cuda_data_dir=/usr/lib/cuda"})

# Number of AutoKeras trials used when the caller does not request another value.
DEFAULT_TRIALS = 100

20 

21 

def train_model(df, target, max_trials=DEFAULT_TRIALS):
    """Train an AutoKeras model on ``df`` and export the best model found.

    The task is chosen from the dtype of the target column: a numeric target
    is trained with a regression head, anything else is label-binarized and
    trained with a classification head.

    Non-numeric feature columns are one-hot encoded with ``pd.get_dummies``.

    Args:
        df: Training data, including the ``target`` column.
        target: Name of the column to predict.
        max_trials: Maximum number of trials passed to ``ak.AutoModel``.

    Returns:
        A tuple ``(model, categorical_dummy_column_names)`` where ``model`` is
        the result of ``export_model()`` and the second item lists every
        column of the encoded feature frame that was not a numeric column of
        the original features. The list is needed again at prediction time.
    """
    # Choose regressor or classifier based on the target data type
    if np.issubdtype(df[target].dtype, np.number):
        y_train = df[target].to_numpy()
        output_head = ak.RegressionHead()
    else:
        y_train = preprocessing.LabelBinarizer().fit_transform(df[target])
        output_head = ak.ClassificationHead()

    trainer = ak.AutoModel(
        inputs=ak.Input(),
        outputs=output_head,
        overwrite=True,
        max_trials=max_trials,
    )

    features = df.drop(target, axis=1)
    # Remember the numeric columns before encoding so that everything else in
    # the encoded frame can be reported back as a "dummy" column.
    numeric_column_names = set(features.select_dtypes(include=[np.number]).columns)

    # Force float dummies: since pandas 2.0 the default dummy dtype is bool,
    # and mixing bool with numeric columns makes ``to_numpy()`` return an
    # object array instead of a numeric one.
    encoded = pd.get_dummies(features, dtype=float)
    categorical_dummy_column_names = [
        col for col in encoded.columns.values.tolist() if col not in numeric_column_names
    ]

    trainer.fit(encoded.to_numpy(), y_train, verbose=2)
    return trainer.export_model(), categorical_dummy_column_names

58 

59 

def get_preds_from_model(df, model, target, column_count, categorical_dummy_column_names):
    """Get predictions from the stored AutoKeras model.

    Args:
        df: Rows to predict. The ``__mindsdb_row_id`` and ``target`` columns
            are ignored when present; ``df`` itself is not modified.
        model: Object exposing ``predict(data, verbose=...)``.
        target: Name of the target column.
        column_count: Number of feature columns the model was trained on.
        categorical_dummy_column_names: Non-numeric / dummy column names
            recorded at training time, in training order.

    Returns:
        Whatever ``model.predict`` returns for the encoded feature frame.

    Raises:
        Exception: If the number of feature columns differs from
            ``column_count``.
    """
    features = df.drop(columns=["__mindsdb_row_id", target], errors="ignore")

    if len(features.columns) != column_count:
        raise Exception("All feature columns must be defined in the WHERE clause when making predictions")

    # Float dummies keep the frame numeric (pandas >= 2.0 defaults to bool).
    encoded = pd.get_dummies(features, dtype=float)

    # Rebuild the training layout: columns that passed through get_dummies
    # unchanged come first, followed by the recorded dummy columns in their
    # training order. ``reindex`` fills dummies absent from this batch with
    # zeros and drops dummies for categories never seen during training, so
    # the model always receives the same columns in the same positions.
    passthrough = [col for col in encoded.columns if col in features.columns]
    already_present = set(passthrough)
    dummy_columns = [col for col in categorical_dummy_column_names if col not in already_present]
    encoded = encoded.reindex(columns=passthrough + dummy_columns, fill_value=0)

    return model.predict(encoded, verbose=2)

76 

77 

def format_categorical_preds(predictions, original_y, df_to_predict, target_col):
    """Transform class predictions back to their original class labels.

    Categorical predictions come out of the AutoKeras model in a binarized
    format, e.g. (0, 1, 0). This function maps them back to their original
    class, e.g. 'Blue', and adds a ``confidence`` column.

    Args:
        predictions: 2-D array-like of model outputs, one row per input row.
        original_y: The target column used for training; it is used to refit
            the label binarizer and to restore the target dtype.
        df_to_predict: Frame to receive the results. It is modified in place.
        target_col: Name of the column that receives the predicted labels.

    Returns:
        ``df_to_predict`` with ``target_col`` and ``confidence`` columns set.
    """
    # Turn prediction back into categorical value
    lb = preprocessing.LabelBinarizer()
    lb.fit(original_y)
    scores = np.asarray(predictions)
    labels = lb.inverse_transform(scores)

    # Attach the frame's own index so the labels are assigned positionally;
    # a bare Series has a fresh RangeIndex and would be aligned by label,
    # producing NaN for frames with any other index.
    df_to_predict[target_col] = pd.Series(labels, index=df_to_predict.index).astype(original_y.dtype)

    # Add the confidence score next to the prediction
    if scores.ndim == 2 and scores.shape[1] == 1:
        # Two-class targets are binarized to a single column, so the score is
        # the positive-class value. The confidence in the predicted class is
        # that value when the positive class is chosen and its complement
        # otherwise (inverse_transform thresholds at 0.5 by default).
        positive = scores[:, 0]
        confidence = np.maximum(positive, 1 - positive)
    else:
        confidence = scores.max(axis=1)
    df_to_predict["confidence"] = confidence
    return df_to_predict

95 

96 

class AutokerasHandler(BaseMLEngine):
    """
    Integration with the AutoKeras ML library.
    """  # noqa

    name = "autokeras"

    def create(self, target: str, df: Optional[pd.DataFrame] = None, args: Optional[dict] = None) -> None:
        """Create and tune an AutoKeras model using the input df.

        Trains the model, saves it under the local ``autokeras`` folder and
        stores everything ``predict`` needs in model storage under the
        ``predict_args`` key.

        Args:
            target: Name of the column to predict.
            df: Training data, including the target column.
            args: Problem definition; only its ``"using"`` entry is read.
                An optional ``train_time`` value in it scales the default
                number of trials.

        Raises:
            ValueError: If ``df`` or ``args`` is missing.
        """
        if df is None:
            raise ValueError("AutoKeras requires training data to create a model")
        if args is None:
            raise ValueError("AutoKeras requires the model arguments (USING clause)")

        args = args["using"]  # ignore the rest of the problem definition
        args["target"] = target

        if "train_time" in args:
            # float() guards against a quoted value (a str multiplied by an
            # int is repeated, not scaled); max() keeps at least one trial.
            max_trials = max(1, int(float(args["train_time"]) * DEFAULT_TRIALS))
        else:
            max_trials = DEFAULT_TRIALS

        # Save the training df in order to filter the training data based on the predict df
        args["training_df"] = df.to_json()
        args["training_data_column_count"] = len(df.columns) - 1  # subtract 1 for target

        random_string = "".join(random.choices(string.ascii_uppercase + string.digits, k=24))
        args["folder_path"] = os.path.join("autokeras", f"{random_string}.keras")
        os.makedirs(os.path.dirname(args["folder_path"]), exist_ok=True)

        model, args["data_column_names"] = train_model(df, target, max_trials)
        model.save(args["folder_path"])
        self.model_storage.json_set("predict_args", args)

    def predict(self, df, args=None):
        """Predict with the best saved AutoKeras model.

        Args:
            df: Rows to predict; a copy is returned, ``df`` is not modified.
            args: Unused; the arguments saved by ``create`` are loaded from
                model storage instead.

        Returns:
            A copy of ``df`` with the target column filled in. For
            non-numeric targets a ``confidence`` column is added as well.
        """
        from io import StringIO

        args = self.model_storage.json_get("predict_args")
        # Wrap the stored JSON in StringIO: passing a literal JSON string to
        # read_json is deprecated in recent pandas releases.
        # NOTE(review): read_json re-infers dtypes, so a target made of
        # digit-only strings may come back numeric here - confirm.
        training_df = pd.read_json(StringIO(args["training_df"]))
        model = load_model(args["folder_path"], custom_objects=ak.CUSTOM_OBJECTS)

        df_to_predict = df.copy()
        predictions = get_preds_from_model(
            df_to_predict,
            model,
            args["target"],
            args["training_data_column_count"],
            args["data_column_names"],
        )

        # If we used the classifier we need to pre-process the predictions before returning them
        original_y = training_df[args["target"]]
        if not np.issubdtype(original_y.dtype, np.number):
            return format_categorical_preds(predictions, original_y, df_to_predict, args["target"])

        df_to_predict[args["target"]] = predictions
        return df_to_predict