Coverage for mindsdb/integrations/handlers/autokeras_handler/autokeras_handler.py

1import os

2import random

3import string

4from typing import Optional

6import numpy as np

7import pandas as pd

8import autokeras as ak

9from sklearn import preprocessing

10from tensorflow.keras.models import load_model

12from mindsdb.integrations.libs.base import BaseMLEngine

# Provide a default CUDA data directory for XLA on systems where this flag
# isn't specified, like Windows Subsystem for Linux. Doesn't break things on Mac.
# setdefault() keeps an XLA_FLAGS value that is already present in the
# environment instead of overwriting it at import time.
os.environ.setdefault("XLA_FLAGS", "--xla_gpu_cuda_data_dir=/usr/lib/cuda")

# Default number of AutoKeras search trials (the default for ``max_trials``).
DEFAULT_TRIALS = 100

def train_model(df, target, max_trials=DEFAULT_TRIALS):
    """Train an AutoKeras model on ``df`` and export the best model found.

    Regression is used when the target column has a numeric dtype; otherwise
    the target is binarized with ``LabelBinarizer`` and a classification head
    is used. Non-numeric feature columns are encoded as dummy columns.

    Args:
        df: Training data, including the ``target`` column.
        target: Name of the column to predict.
        max_trials: Maximum number of AutoKeras search trials.

    Returns:
        A tuple ``(model, categorical_dummy_column_names)``: the exported
        model and the names of all feature columns (after dummy encoding)
        that are not original numeric columns. The names are passed later to
        the prediction helper so it can recreate the same dummy columns.
    """
    # Choose regressor or classifier based on the target's data type.
    if np.issubdtype(df[target].dtype, np.number):
        y_train = df[target].to_numpy()
        head = ak.RegressionHead()
    else:
        y_train = preprocessing.LabelBinarizer().fit_transform(df[target])
        head = ak.ClassificationHead()

    training_df = df.drop(target, axis=1)
    trainer = ak.AutoModel(
        inputs=ak.Input(),
        outputs=head,
        overwrite=True,
        max_trials=max_trials,
    )

    # Remember the numeric columns before any categorical column is expanded
    # into dummies; everything else in the encoded frame is reported back.
    numeric_column_names = set(training_df.select_dtypes(include=[np.number]).columns)

    # pandas >= 2.0 emits bool dummy columns by default; mixed with numeric
    # columns that makes to_numpy() return an object array. Asking for float
    # dummies keeps the feature matrix numeric on every pandas version.
    training_df = pd.get_dummies(training_df, dtype=float)
    categorical_dummy_column_names = [
        col for col in training_df.columns.tolist() if col not in numeric_column_names
    ]

    trainer.fit(training_df.to_numpy(), y_train, verbose=2)
    return trainer.export_model(), categorical_dummy_column_names

def get_preds_from_model(df, model, target, column_count, categorical_dummy_column_names):
    """Get predictions from the stored AutoKeras model.

    Args:
        df: Rows to predict. The ``__mindsdb_row_id`` and ``target`` columns
            are ignored if present; ``df`` itself is not modified.
        model: Object exposing ``predict(features, verbose=...)``.
        target: Name of the target column.
        column_count: Number of feature columns the model was trained on.
        categorical_dummy_column_names: Non-numeric feature column names
            recorded at training time (after dummy encoding), in training
            order.

    Returns:
        Whatever ``model.predict`` returns for the encoded feature frame.

    Raises:
        Exception: If the number of feature columns differs from
            ``column_count``.
    """
    df_to_predict = df.drop(columns=["__mindsdb_row_id", target], errors="ignore")

    if len(df_to_predict.columns) != column_count:
        raise Exception("All feature columns must be defined in the WHERE clause when making predictions")

    # Float dummies keep the frame numeric on pandas >= 2.0, where the
    # default dummy dtype is bool.
    df_with_dummies = pd.get_dummies(df_to_predict, dtype=float)

    # Columns that get_dummies left untouched keep their position; they are
    # followed by the training-time dummy columns in their recorded order.
    # Appending missing dummies at the end (as was done before) could put the
    # features in a different order than the one used for training.
    passthrough = [col for col in df_with_dummies.columns if col in df_to_predict.columns]
    passthrough_lookup = set(passthrough)
    dummies = [col for col in categorical_dummy_column_names if col not in passthrough_lookup]

    # reindex() fills dummies absent from this batch with zeros and drops
    # dummies for categories that were not recorded at training time, so the
    # feature count always matches the recorded training columns.
    df_with_dummies = df_with_dummies.reindex(columns=passthrough + dummies, fill_value=0.0)

    return model.predict(df_with_dummies, verbose=2)

def format_categorical_preds(predictions, original_y, df_to_predict, target_col):
    """Transform class predictions back to their original class.

    Categorical predictions come out of the AutoKeras model in a binarized
    format, e.g. (0, 1, 0). This function maps them back to their original
    class, e.g. 'Blue', and adds a DataFrame column with the model
    confidence score.

    Args:
        predictions: Model output, one row per row of ``df_to_predict``.
        original_y: Target values from training; used to refit the label
            binarizer and to pick the dtype of the predicted column.
        df_to_predict: Frame to annotate. It is modified in place.
        target_col: Name of the column that receives the predicted class.

    Returns:
        ``df_to_predict`` with ``target_col`` and ``"confidence"`` columns set.
    """
    # Turn prediction back into categorical value.
    lb = preprocessing.LabelBinarizer()
    lb.fit(original_y)
    preds = lb.inverse_transform(predictions)

    # Build the Series on df_to_predict's own index. A Series with a default
    # RangeIndex is aligned by label on assignment, which yields NaN or
    # misplaced labels whenever the frame's index is not 0..n-1.
    df_to_predict[target_col] = pd.Series(preds, index=df_to_predict.index).astype(original_y.dtype)

    # Add the confidence score next to the prediction.
    scores = np.asarray(predictions)
    if scores.ndim == 2 and scores.shape[1] == 1:
        # Single-column output (two classes): the column is assumed to be the
        # score of the positive class, so the confidence in the class that
        # was actually picked is max(p, 1 - p) rather than p itself.
        positive = scores[:, 0]
        confidence = np.maximum(positive, 1 - positive)
    else:
        confidence = scores.max(axis=1)
    df_to_predict["confidence"] = confidence
    return df_to_predict

class AutokerasHandler(BaseMLEngine):
    """
    Integration with the AutoKeras ML library.
    """  # noqa

    name = "autokeras"

    def create(self, target: str, df: Optional[pd.DataFrame] = None, args: Optional[dict] = None) -> None:
        """Create and tune an AutoKeras model using the input df.

        Saves the exported model under a randomly named path in the
        ``autokeras`` folder and stores the arguments needed at prediction
        time in model storage under ``"predict_args"``.

        Args:
            target: Name of the column to predict.
            df: Training data, including the target column.
            args: Problem definition; only its ``"using"`` entry is read.
                An optional ``"train_time"`` value in it scales the default
                number of search trials.
        """
        # Ignore the rest of the problem definition. A missing args dict or a
        # missing "using" entry falls back to the defaults instead of failing
        # with an opaque TypeError/KeyError.
        using = (args or {}).get("using")
        args = using if using is not None else {}
        args["target"] = target

        if "train_time" in args:
            # Small train_time values would truncate to zero trials, so run
            # at least one.
            max_trials = max(1, int(args["train_time"] * DEFAULT_TRIALS))
        else:
            max_trials = DEFAULT_TRIALS

        # Save the training df; predict() reads the target column back from it.
        args["training_df"] = df.to_json()
        args["training_data_column_count"] = len(df.columns) - 1  # subtract 1 for target

        random_string = "".join(random.choices(string.ascii_uppercase + string.digits, k=24))
        args["folder_path"] = os.path.join("autokeras", f"{random_string}.keras")
        os.makedirs(os.path.dirname(args["folder_path"]), exist_ok=True)

        model, args["data_column_names"] = train_model(df, target, max_trials)
        model.save(args["folder_path"])
        self.model_storage.json_set("predict_args", args)

    def predict(self, df, args=None):
        """Predict with the best saved AutoKeras model.

        Args:
            df: Rows to predict; a copy is annotated and returned.
            args: Unused; the stored ``"predict_args"`` are read instead.

        Returns:
            A copy of ``df`` with the target column filled in. For a
            non-numeric target the result of ``format_categorical_preds``
            is returned.
        """
        from io import StringIO

        args = self.model_storage.json_get("predict_args")
        # Wrap the stored JSON in a file-like object: passing a literal JSON
        # string to read_json is deprecated in recent pandas versions.
        training_df = pd.read_json(StringIO(args["training_df"]))
        model = load_model(args["folder_path"], custom_objects=ak.CUSTOM_OBJECTS)

        df_to_predict = df.copy()
        predictions = get_preds_from_model(
            df_to_predict,
            model,
            args["target"],
            args["training_data_column_count"],
            args["data_column_names"],
        )

        # If we used the classifier we need to post-process the predictions
        # before returning them.
        # NOTE(review): the dtype is taken from the JSON round-tripped frame;
        # confirm it always matches the dtype seen at training time.
        original_y = training_df[args["target"]]
        if not np.issubdtype(original_y.dtype, np.number):
            return format_categorical_preds(predictions, original_y, df_to_predict, args["target"])

        df_to_predict[args["target"]] = predictions
        return df_to_predict

Coverage for mindsdb / integrations / handlers / autokeras_handler / autokeras_handler.py: 0%

71 statements