Source code for lightwood.encoder.base

from typing import List, Iterable
import torch


class BaseEncoder:
    """
    Base class for all encoders.

    An encoder should return encoded representations of any columnar data.
    The procedure for this is defined inside the `encode()` method.

    If this encoder is expected to handle an output column, then it also needs to implement the respective
    `decode()` method, which handles the inverse transformation from encoded representations to the final
    prediction in the original column space.

    For encoders that learn representations (as opposed to rule-based ones), the `prepare()` method will
    handle all learning logic.

    The `to()` method is used to move PyTorch-based encoders to and from a GPU.

    :param is_target: Whether the data to encode is the target, as per the problem definition.
    :param is_timeseries_encoder: Whether the encoder represents sequential/time-series data; Lightwood
        must provide specific treatment for this kind of encoder.
    :param is_trainable_encoder: Whether the encoder must return learned representations. Lightwood checks
        this flag in order to pass data for representation learning via the `prepare()` method.

    Class Attributes:
    - is_prepared: Internal flag to signal that the `prepare()` method has been successfully executed.
    - dependencies: List of additional columns that the encoder might need in order to encode.
    - output_size: Length of each encoding tensor for a single data point.
    """  # noqa
    is_target: bool
    is_prepared: bool

    is_timeseries_encoder: bool = False
    is_trainable_encoder: bool = False

    def __init__(self, is_target=False) -> None:
        self.is_target = is_target
        self.is_prepared = False
        self.dependencies = []
        self.output_size = None
    # Not all encoders need to be prepared
    def prepare(self, priming_data: Iterable[object]) -> None:
        """
        Given `priming_data` (i.e. training data), prepares the encoder, either through a rule-based
        (e.g. one-hot encoding) or a learned (e.g. DistilBERT for text) model. This method operates
        exclusively on training data.

        :param priming_data: An iterable data structure whose elements all have a type compatible with
            the encoder's processing type; this may differ per encoder.
        """  # noqa
        self.is_prepared = True
    def encode(self, column_data: Iterable[object]) -> torch.Tensor:
        """
        Given the approach defined in `prepare()`, encodes column data into a numerical representation
        that forms part of the feature vector. After all columns are featurized, the encoded vectors
        are concatenated to form one feature vector per row in the dataset.

        :param column_data: An iterable data structure whose elements all have a type compatible with
            the encoder's processing type; this may differ per encoder.
        :returns: The encoded representation of the data, per column.
        """  # noqa
        raise NotImplementedError
    def decode(self, encoded_data: torch.Tensor) -> List[object]:
        """
        Given an encoded representation, returns the decoded value. Decoded values may not exist for
        all encoders (e.g. rich text, audio, etc.).

        :param encoded_data: The input representation in encoded format.
        :returns: The decoded representation of the data, per column, in the original data type.
        """  # noqa
        raise NotImplementedError
    # TODO: Should work for all torch-based encoders, but custom behavior may have to be implemented for weird models
    def to(self, device, available_devices):
        # Find all nn.Module type objects and convert them
        # @TODO: Make this work recursively
        for v in vars(self):
            attr = getattr(self, v)
            if isinstance(attr, torch.nn.Module):
                attr.to(device)
        return self
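
For illustration, a minimal rule-based subclass might look like the following sketch. This is not part of lightwood's source: the class name `BooleanEncoderExample` and its truthy/falsy mapping are hypothetical, chosen only to show how `prepare()`, `encode()` and `decode()` fit together.

class BooleanEncoderExample(BaseEncoder):
    # Hypothetical example encoder; not part of lightwood itself.

    def prepare(self, priming_data: Iterable[object]) -> None:
        # Rule-based: there is nothing to learn, so just record the encoding
        # width and flag readiness.
        self.output_size = 1
        self.is_prepared = True

    def encode(self, column_data: Iterable[object]) -> torch.Tensor:
        # Map truthy values to 1.0 and falsy values to 0.0, one scalar per row.
        return torch.tensor([[1.0 if v else 0.0] for v in column_data])

    def decode(self, encoded_data: torch.Tensor) -> List[object]:
        # Inverse transformation: threshold each encoded scalar back to a bool.
        return [x > 0.5 for x in encoded_data.flatten().tolist()]

Used in isolation, `encode()` on `[True, False]` would return `tensor([[1.], [0.]])`, and `decode()` on that tensor would return `[True, False]`.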