from typing import Union
import torch
import numpy as np
import pandas as pd
from lightwood.encoder.base import BaseEncoder
from lightwood.helpers.log import log


class DatetimeEncoder(BaseEncoder):
    """
    This encoder produces an encoded representation for timestamps.

    The approach consists of decomposing each timestamp into its constituent units (e.g. month, year, etc.),
    and describing each unit with a single value that represents its magnitude within a sensible cycle length.
    """  # noqa

    def __init__(self, is_target: bool = False):
        super().__init__(is_target)
        self.constant_keys = ['year', 'month', 'day', 'hour', 'minute', 'second']
        self.constant_vals = torch.Tensor([3000.0, 12.0, 31.0, 24.0, 60.0, 60.0])  # cycle length per unit
        self.constant_map = {k: v.item() for k, v in zip(self.constant_keys, self.constant_vals)}
        self.output_size = len(self.constant_keys)
        self.empty_vector = np.zeros((self.output_size, ))
        # Per-unit bounds used to clamp decoded values; year bounds stay inside pandas' supported Timestamp range
        self.max_vals = torch.Tensor([pd.Timestamp.max.year - 1, 12, 31, 23, 59, 59])
        self.min_vals = torch.Tensor([pd.Timestamp.min.year + 1, 1, 1, 0, 0, 0])

    def prepare(self, priming_data):
        # Rule-based encoder: there is nothing to fit, so preparation only flags readiness
        self.is_prepared = True

    def encode(self, data: Union[np.ndarray, pd.Series]) -> torch.Tensor:
        """
        :param data: a pandas Series with numerical dtype, previously cleaned with dataprep_ml
        :return: encoded data, shape (len(data), self.output_size)
        """
        if type(data) not in (np.ndarray, pd.Series):
            raise Exception(f'Data should be pd.Series or np.ndarray! Got: {type(data)}')

        if isinstance(data, np.ndarray):
            data = pd.Series(data)

        # Missing values are filled with a sentinel (the maximum representable timestamp)
        data = data.fillna(pd.Timestamp.max.timestamp())  # TODO: replace with mean?
        ret = [pd.to_datetime(data, unit='s', origin=-1, utc=True)]

        # Extract each datetime component and normalize it by its cycle length
        for i, attr in enumerate(self.constant_keys):
            def _get_ts_attr(ts):
                return getattr(ts, attr)

            component = ret[0].apply(_get_ts_attr)
            component = component / self.constant_vals[i].item()
            ret.append(component)

        out = torch.Tensor(ret[1:])  # drop the column with the raw timestamp objects
        out = torch.transpose(out, 0, 1)  # swap dimensions to shape (B, self.output_size)
        return out

    def decode(self, encoded_data: torch.Tensor, return_as_datetime=False) -> list:
        if len(encoded_data.shape) > 2 and encoded_data.shape[0] == 1:
            encoded_data = encoded_data.squeeze(0)

        # Undo the normalization, then clamp each component to its valid range
        rounded = torch.round(torch.multiply(encoded_data, self.constant_vals))
        high_bounded = torch.minimum(rounded, self.max_vals)
        low_bounded = torch.maximum(high_bounded, self.min_vals)
        ret = low_bounded.long()

        # Rows that decode to the (clamped) sentinel date used for missing values are mapped back to NaN
        df = pd.DataFrame(ret, columns=self.constant_keys)
        nan_mask = df[
            (df['year'] == int(self.max_vals[0])) &
            (df['month'] == pd.Timestamp.max.month) &
            (df['day'] == pd.Timestamp.max.day)
        ].index

        dt = pd.to_datetime(df, utc=True)
        if not hasattr(dt, 'dt'):
            log.warning('DatetimeEncoder failed to decode at microsecond precision, reverting to nanosecond precision. This may lead to minor discrepancies in the reconstruction.')  # noqa

        if return_as_datetime is True:
            decoded = dt.dt.to_pydatetime()  # back to Python datetime objects (microsecond precision)
        else:
            decoded = dt.values.astype(np.float64) // 10 ** 9  # nanoseconds -> UNIX seconds

        decoded[nan_mask] = np.nan
        return decoded.tolist()
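

# A minimal usage sketch (illustrative only): encode a couple of UNIX timestamps
# given in seconds and decode them back.
if __name__ == '__main__':
    import datetime

    encoder = DatetimeEncoder()
    encoder.prepare([])  # rule-based encoder, so there is no actual fitting step

    timestamps = pd.Series([
        datetime.datetime(2021, 3, 1, 12, 0, tzinfo=datetime.timezone.utc).timestamp(),
        datetime.datetime(2022, 7, 15, 8, 30, tzinfo=datetime.timezone.utc).timestamp(),
    ])

    encoded = encoder.encode(timestamps)  # tensor of shape (2, 6)
    decoded = encoder.decode(encoded)     # list of UNIX timestamps as float seconds
    print(encoded)
    print(decoded)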