Source code for lightwood.encoder.categorical.autoencoder

import random
import numpy as np
import torch
from torch.utils.data import DataLoader
from lightwood.mixer.helpers.ranger import Ranger
from lightwood.encoder.categorical.simple_label import SimpleLabelEncoder
from lightwood.encoder.categorical.onehot import OneHotEncoder
from lightwood.encoder.categorical.gym import Gym
from lightwood.encoder.base import BaseEncoder
from lightwood.helpers.log import log
from lightwood.mixer.helpers.default_net import DefaultNet
import pandas as pd

from typing import Iterable, Tuple, List


class CategoricalAutoEncoder(BaseEncoder):
    """
    Trains an autoencoder (AE) to represent categorical information with over 100 categories.

    This is used to ensure that feature vectors for categorical data with many categories are
    not excessively large. The AE defaults to a vector sized 100 but can be adjusted to user
    preference.

    It is highly advised NOT to use this encoder to feature engineer your target, as
    reconstruction accuracy will determine your AE's ability to decode properly.
    """
    is_trainable_encoder: bool = True

    def __init__(
        self,
        stop_after: float = 3600,
        is_target: bool = False,
        max_encoded_length: int = 100,
        desired_error: float = 0.01,
        batch_size: int = 200,
        device: str = '',
        input_encoder: str = None
    ):
        """
        :param stop_after: Stops training with provided time limit (sec). Only honoured when the
            input encoder is a `OneHotEncoder`; with `SimpleLabelEncoder` training is capped at
            120 seconds.
        :param is_target: Encoder represents target class (NOT recommended)
        :param max_encoded_length: Maximum length of vector represented. Overridden to 32 when
            `SimpleLabelEncoder` is used as the input encoder.
        :param desired_error: Threshold for reconstruction accuracy error. Only honoured with
            `OneHotEncoder`; with `SimpleLabelEncoder` a fixed threshold of 1e-9 is used.
        :param batch_size: Upper bound for the training batch size; the effective value is
            reduced to `len(data) / 50` (but never below 1) when the data is small.
        :param device: Name of the device that get_device_from_name will attempt to use
        :param input_encoder: one of `OneHotEncoder` or `SimpleLabelEncoder` to force usage of the
            underlying input encoder. Note that OHE does not scale for categorical features with
            high cardinality, while SLE can but is less accurate overall.
        """
        super().__init__(is_target)
        self.is_prepared = False
        self.name = 'Categorical Autoencoder'
        self.output_size = max_encoded_length

        # Model details
        self.net = None
        self.encoder = None
        self.decoder = None
        self.device_type = device
        # Holds the requested encoder *name* (or None) until prepare() replaces it
        # with the actual encoder instance.
        self.input_encoder = input_encoder

        # Training details
        self.batch_size = batch_size
        self.desired_error = desired_error
        self.stop_after = stop_after

    def prepare(self, train_priming_data: pd.Series, dev_priming_data: pd.Series):
        """
        Creates inputs and prepares a categorical autoencoder (CatAE) for input data.

        Currently, does not support a dev set; inputs for train and dev are concatenated
        together to train an autoencoder.

        :param train_priming_data: Input training data
        :param dev_priming_data: Input dev data (Not supported currently)
        :raises Exception: if the encoder has already been prepared
        :raises AssertionError: if `input_encoder` is neither `None` nor one of the supported names
        """
        if self.is_prepared:
            raise Exception('You can only call "prepare" once for a given encoder.')

        if self.is_target:
            log.warning('You are trying to use an autoencoder for the target value! This is very likely a bad idea.')

        # Explicit raise instead of `assert` so the check survives `python -O`;
        # AssertionError is kept so existing callers see the same exception type.
        if self.input_encoder is not None and self.input_encoder not in ('OneHotEncoder', 'SimpleLabelEncoder'):
            raise AssertionError(
                f'Provided an invalid input encoder ({self.input_encoder}), please use either `OneHotEncoder` or `SimpleLabelEncoder`.'  # noqa
            )

        log.info('Preparing a categorical autoencoder.')

        # Label encoding is used when explicitly requested, or automatically once the
        # training column has more than 500 distinct values.
        use_label_encoder = self.input_encoder == 'SimpleLabelEncoder' or (
            self.input_encoder is None and train_priming_data.nunique() > 500
        )

        if use_label_encoder:
            log.info('Deploying SimpleLabelEncoder for CategoricalAutoEncoder input.')
            self.input_encoder = SimpleLabelEncoder(is_target=self.is_target)
            input_len = self.input_encoder.output_size
            # The embedding size is fixed here, regardless of `max_encoded_length`.
            self.output_size = 32
            net_shape = [input_len, 128, 64, self.output_size, input_len]
        else:
            log.info('Deploying OneHotEncoder for CategoricalAutoEncoder input.')
            self.input_encoder = OneHotEncoder(is_target=self.is_target)
            # Resolved in _prepare_catae(): the OHE output size is only read there,
            # after the input encoder has been prepared.
            net_shape = None

        train_loader, dev_loader = self._prepare_AE_input(train_priming_data, dev_priming_data)

        best_model = self._prepare_catae(train_loader, dev_loader, net_shape=net_shape)
        self.net = best_model.to(self.net.device)

        # Flatten the network into its leaf layers, dropping the container modules.
        modules = [
            module
            for module in self.net.modules()
            if type(module) != torch.nn.Sequential and type(module) != DefaultNet
        ]

        # Everything but the last layer produces the embedding; the last layer reconstructs
        # the input representation. NOTE(review): relies on DefaultNet's layer layout — confirm.
        self.encoder = torch.nn.Sequential(*modules[0:-1]).eval()
        self.decoder = torch.nn.Sequential(*modules[-1:]).eval()

        log.info('Categorical autoencoder ready.')
        self.is_prepared = True

    def encode(self, column_data: Iterable[str]) -> torch.Tensor:
        """
        Encodes categorical information in column as the compressed vector from the CatAE.

        :param column_data: An iterable of category samples from a column
        :returns: An embedding for each sample in original input (on CPU)
        :raises Exception: if called before `prepare`
        """
        # Before prepare(), `input_encoder` is None or a plain string (whose own `.encode`
        # would be called by mistake), so fail early with a clear message.
        if not self.is_prepared:
            raise Exception('You need to call "prepare" before calling "encode" or "decode".')

        encoded_tensor = self.input_encoder.encode(column_data)

        with torch.no_grad():
            encoded_tensor = encoded_tensor.to(self.net.device)
            # 1-D encodings are given a trailing feature dimension before entering the net.
            if len(encoded_tensor.shape) < 2:
                encoded_tensor = encoded_tensor.unsqueeze(-1)
            embeddings = self.encoder(encoded_tensor)

        return embeddings.to('cpu')

    def decode(self, encoded_data: torch.Tensor) -> List[str]:
        """
        Decodes from the embedding space, the original categories.

        .. warning::
            If your reconstruction accuracy is not 100%, the CatAE may not return the
            correct category.

        :param encoded_data: A torch tensor of embeddings for category predictions
        :returns: A list of 'translated' categories for each embedding
        :raises Exception: if called before `prepare`
        """
        if not self.is_prepared:
            raise Exception('You need to call "prepare" before calling "encode" or "decode".')

        with torch.no_grad():
            encoded_data = encoded_data.to(self.net.device)
            encoded_tensor = self.decoder(encoded_data)
            encoded_tensor = encoded_tensor.to('cpu')

        return self.input_encoder.decode(encoded_tensor)

    def _prepare_AE_input(
        self, train_priming_data: pd.Series, dev_priming_data: pd.Series
    ) -> Tuple[DataLoader, DataLoader]:
        """
        Creates the data loaders for the CatAE model inputs. Expected inputs are generally of
        form `pd.Series`.

        Currently does not use 'dev'; concatenates both inputs together. A missing (`None`) or
        empty dev set is accepted and simply ignored.

        Input to the `DataLoader` must be an Iterable[str] (ideally List[str])

        :param train_priming_data: Input training data
        :param dev_priming_data: Input dev data; merged into the training data when non-empty
        :returns: The training loader and `None` in place of the dev loader
        """
        if dev_priming_data is not None and len(dev_priming_data) > 0:
            priming_data = (
                pd.concat([train_priming_data, dev_priming_data]).astype(str).tolist()
            )
        else:
            priming_data = [str(x) for x in train_priming_data]

        # Seeds Python's global RNG with the dataset size.
        random.seed(len(priming_data))

        # Prepare the input encoder (OHE or SLE) that feeds the CatAE
        self.input_encoder.prepare(priming_data)

        # Cap the batch size at 1/50th of the data, never going below 1.
        self.batch_size = max(min(self.batch_size, int(len(priming_data) / 50)), 1)

        # Each sample is paired with itself: the autoencoder's target is its own input.
        train_loader = DataLoader(
            list(zip(priming_data, priming_data)),
            batch_size=self.batch_size,
            shuffle=True,
        )

        # TODO; make `Gym` compatible with a dev set
        dev_loader = None

        return train_loader, dev_loader

    def _prepare_catae(self, train_loader: DataLoader, dev_loader: DataLoader, net_shape=None):
        """
        Trains the CatAE using Lightwood's `Gym` class.

        :param train_loader: Training dataset Loader
        :param dev_loader: Validation set DataLoader
        :param net_shape: Layer sizes for the network; when `None`, a single hidden layer of
            `self.output_size` units between input and reconstruction is used
        :returns: The best model found by `Gym.fit`
        :raises Exception: if the input encoder is of an unsupported type
        """
        if net_shape is None:
            input_len = self.input_encoder.output_size
            net_shape = [input_len, self.output_size, input_len]

        self.net = DefaultNet(shape=net_shape, device=self.device_type)

        if isinstance(self.input_encoder, OneHotEncoder):
            # Classification over categories: targets are class indexes.
            criterion = torch.nn.CrossEntropyLoss()
            desired_error = self.desired_error
            optimizer = Ranger(self.net.parameters())
            output_encoder = self._encoder_targets
            max_time = self.stop_after
        elif isinstance(self.input_encoder, SimpleLabelEncoder):
            # Regression on the label value, with fixed error threshold and time budget.
            criterion = torch.nn.MSELoss()
            desired_error = 1e-9
            optimizer = Ranger(self.net.parameters(), weight_decay=1e-2)
            output_encoder = self._label_targets
            max_time = 60 * 2
        else:
            raise Exception(f'[CatAutoEncoder] Input encoder of type {type(self.input_encoder)} is not supported!')

        gym = Gym(
            model=self.net,
            optimizer=optimizer,
            scheduler=None,
            loss_criterion=criterion,
            device=self.net.device,
            name=self.name,
            input_encoder=self.input_encoder.encode,
            output_encoder=output_encoder,
        )

        best_model, _, _ = gym.fit(
            train_loader,
            dev_loader,
            desired_error=desired_error,
            max_time=max_time,
            eval_every_x_epochs=1,
            max_unimproving_models=5,
        )

        return best_model

    def _encoder_targets(self, data):
        """
        Encodes target data with a OHE encoder.

        :param data: Batch of raw category values
        :returns: A LongTensor, on the network's device, holding the column index of every
            positive entry of the one-hot encoding
        """
        encoded_categories = self.input_encoder.encode(data)
        target = encoded_categories.cpu().numpy()
        # Column indexes of the positive entries; this yields one index per sample only if
        # each row has exactly one positive entry — presumably guaranteed by the OHE, confirm.
        target_indexes = np.where(target > 0)[1]
        targets_c = torch.LongTensor(target_indexes)
        labels = targets_c.to(self.net.device)
        return labels

    def _label_targets(self, data):
        """
        Encodes target data with a label encoder.

        :param data: Batch of raw category values
        :returns: The label encoding, with a trailing dimension added when it has fewer than 2
        """
        data = pd.Series(data)
        enc = self.input_encoder.encode(data)
        if len(enc.shape) < 2:
            enc = enc.unsqueeze(-1)
        return enc