Source code for lightwood.data.timeseries_analyzer

from typing import Dict, Tuple, List

import numpy as np
import pandas as pd
from type_infer.dtype import dtype

from lightwood.api.types import TimeseriesSettings
from lightwood.helpers.ts import get_ts_groups, get_delta, Differencer
from lightwood.encoder.time_series.helpers.common import generate_target_group_normalizers


def timeseries_analyzer(data: Dict[str, pd.DataFrame], dtype_dict: Dict[str, str],
                        timeseries_settings: TimeseriesSettings, target: str) -> Dict:
    """
    This function analyzes (pre-processed) time series data and stores a few useful insights used in the rest of Lightwood's pipeline.

    :param data: dictionary with the dataset split into train, dev, test subsets.
    :param dtype_dict: dictionary with inferred types for every column.
    :param timeseries_settings: A `TimeseriesSettings` object. For more details, check `lightwood.types.TimeseriesSettings`.
    :param target: name of the target column.

    The following things are extracted from each time series inside the dataset:
      - group_combinations: all observed combinations of values for the set of `group_by` columns. The length of this list determines how many time series are in the data.
      - deltas: inferred sampling interval.
      - ts_naive_residuals: residuals obtained from the data by a naive forecaster that repeats the last-seen value.
      - ts_naive_mae: mean residual value obtained from the data by a naive forecaster that repeats the last-seen value.
      - target_normalizers: objects that may normalize the data within any given time series for effective learning. See `lightwood.encoder.time_series.helpers.common` for available choices.

    :return: Dictionary with the aforementioned insights and the `TimeseriesSettings` object for future reference.
    """  # noqa
    tss = timeseries_settings
    groups = get_ts_groups(data['train'], tss)
    deltas, periods, freqs = get_delta(data['train'], tss)

    normalizers = generate_target_group_normalizers(data['train'], target, dtype_dict, tss)

    if dtype_dict[target] in (dtype.integer, dtype.float, dtype.num_tsarray):
        naive_forecast_residuals, scale_factor = get_grouped_naive_residuals(data['dev'], target, tss)
        differencers = get_differencers(data['train'], target, tss.group_by)
    else:
        naive_forecast_residuals, scale_factor = {}, {}
        differencers = {}

    return {'target_normalizers': normalizers,
            'deltas': deltas,
            'tss': tss,
            'group_combinations': groups,
            'ts_naive_residuals': naive_forecast_residuals,
            'ts_naive_mae': scale_factor,
            'periods': periods,
            'sample_freqs': freqs,
            'stl_transforms': {},  # TODO: remove, or provide from outside as user perhaps
            'differencers': differencers
            }
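
# --- Illustrative usage (not part of the original module) -------------------
# A minimal sketch of how the analyzer might be invoked on an already-split
# dataset. The split keys, column names ('saledate', 'bedrooms', 'MA') and the
# exact `TimeseriesSettings` fields shown are assumptions for illustration;
# see `lightwood.api.types.TimeseriesSettings` for the authoritative schema.
#
#   splits = {'train': train_df, 'dev': dev_df, 'test': test_df}
#   tss = TimeseriesSettings.from_dict({'order_by': 'saledate',
#                                       'group_by': ['bedrooms'],
#                                       'window': 8,
#                                       'horizon': 4})
#   ts_analysis = timeseries_analyzer(splits,
#                                     dtype_dict={'MA': dtype.float,
#                                                 'bedrooms': dtype.categorical,
#                                                 'saledate': dtype.datetime},
#                                     timeseries_settings=tss,
#                                     target='MA')
#   ts_analysis['deltas']              # inferred sampling interval per series
#   ts_analysis['group_combinations']  # observed `group_by` value combinations
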
def get_naive_residuals(target_data: pd.DataFrame, m: int = 1) -> Tuple[List, float]:
    """
    Computes forecasting residuals for the naive method (the forecast for time `t` is the value observed at `t-1`).
    Useful for computing MASE forecasting error.

    As per arxiv.org/abs/2203.10716, we resort to a constant forecast based on the last-seen measurement across the entire horizon.
    By following the original measure, the naive forecaster would have the advantage of knowing the actual values whereas the predictor would not.

    Note: method assumes predictions are all for the same group combination. For a dataframe that contains multiple series, use `get_grouped_naive_residuals`.

    :param target_data: observed time series targets
    :param m: season length. The naive forecast will be the m-th previously seen value for each series.

    :return: (list of naive residuals, average residual value)
    """  # noqa
    # @TODO: support categorical series as well
    residuals = np.abs(target_data.values[1:] - target_data.values[0]).flatten()
    scale_factor = np.average(residuals)
    return residuals.tolist(), scale_factor


def get_grouped_naive_residuals(info: pd.DataFrame,
                                target: str,
                                tss: TimeseriesSettings) -> Tuple[Dict, Dict]:
    """
    Wraps `get_naive_residuals` for a dataframe with multiple co-existing time series.
    """  # noqa
    group_residuals = {}
    group_scale_factors = {}
    grouped = info.groupby(by=tss.group_by) if tss.group_by else info.groupby(lambda x: '__default')
    for group, subset in grouped:
        if subset.shape[0] > 1:
            residuals, scale_factor = get_naive_residuals(subset[target])  # @TODO: pass m once we handle seasonality
            group_residuals[group] = residuals
            group_scale_factors[group] = scale_factor
    return group_residuals, group_scale_factors


def get_differencers(data: pd.DataFrame, target: str, group_cols: List):
    """
    Fits one `Differencer` object per time series (i.e. per observed `group_by` combination) on the target column.
    """
    differencers = {}
    grouped = data.groupby(by=group_cols) if group_cols else data.groupby(lambda x: True)
    for group, subset in grouped:
        differencer = Differencer()
        differencer.fit(subset[target].values)
        differencers[group] = differencer
    return differencers
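
# --- Illustrative usage (not part of the original module) -------------------
# A small sketch of the helpers above on a toy two-series frame. The column
# names ('store', 'sales') are made up for illustration.
#
#   df = pd.DataFrame({'store': ['a', 'a', 'a', 'b', 'b', 'b'],
#                      'sales': [10.0, 12.0, 11.0, 100.0, 98.0, 103.0]})
#
#   # Residuals of a constant last-seen-value forecast for a single series:
#   res, mae = get_naive_residuals(df[df['store'] == 'a']['sales'])
#   # res == [2.0, 1.0], mae == 1.5 -> later used as the MASE scale factor
#
#   # One fitted Differencer per series:
#   diffs = get_differencers(df, 'sales', group_cols=['store'])
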