Source code for lightwood.analysis.helpers.conf_stats

from copy import deepcopy
from typing import Dict
from types import SimpleNamespace

from sklearn.preprocessing import OrdinalEncoder

from lightwood.analysis.base import BaseAnalysisBlock


class ConfStats(BaseAnalysisBlock):
    """
    Computes confidence-related statistics on the held-out validation dataset.

    TODO: regression & forecasting tasks
    """

    def __init__(self, deps=('ICP',), ece_bins: int = 10):
        super().__init__(deps=deps)
        self.ece_bins = ece_bins
        self.ordenc = OrdinalEncoder()
        self.n_decimals = 3

    def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
        # wrap the keyword arguments in a namespace for easier attribute access
        ns = SimpleNamespace(**kwargs)

        if ns.is_classification:
            possible_labels = ns.stats_info.train_observed_classes
            self.ordenc.fit([[label] for label in possible_labels])
            task_type = 'categorical'
        elif ns.is_multi_ts:
            task_type = 'multi_ts'
        elif ns.is_numerical:
            task_type = 'numerical'
        else:
            return info

        # compute binned and global calibration statistics on the validation data
        ces, ece, mce, gscore = self._get_stats(info['result_df'],
                                                ns.normal_predictions,
                                                ns.data,
                                                ns.target,
                                                task_type)

        info['maximum_calibration_error'] = round(mce, self.n_decimals)
        info['expected_calibration_error'] = round(ece, self.n_decimals)
        info['binned_conf_acc_difference'] = ces
        info['global_calibration_score'] = round(gscore, self.n_decimals)
        return info

    def _get_stats(self, confs, preds, data, target, task_type='categorical'):
        """
        Computes expected and maximum calibration error over the validation data.

        The number of bins is specified by `self.ece_bins`. Data is sorted by increasing confidence prior to binning.
        For classification tasks, accuracy is exact label agreement; for numerical and forecasting tasks, a prediction
        counts as correct when the true value falls within the predicted bounds.

        :return:
        bins: bin-wise absolute difference between estimated confidence and true accuracy.
        ece: average of all bin gaps.
        mce: maximum value in `bins`.
        global_score: 1.0 minus absolute difference between accuracy and confidence over the entire validation set.
        """
        confs = deepcopy(confs).reset_index(drop=True)
        sorted_preds = deepcopy(preds).reset_index(drop=True)
        sorted_inp = deepcopy(data).reset_index(drop=True)
        sorted_val = confs.sort_values(by='confidence', kind='stable')
        sorted_inp['__mdb_confidence'] = sorted_val['confidence']

        if task_type == 'categorical':
            sorted_inp['__mdb_prediction'] = sorted_preds['prediction']
        else:
            # for numerical and forecasting tasks, a prediction "hits" when the target lies inside the predicted bounds
            if isinstance(confs['lower'][0], list):
                sorted_inp['__mdb_lower'] = confs['lower'].apply(lambda x: x[0])
                sorted_inp['__mdb_upper'] = confs['upper'].apply(lambda x: x[0])
            else:
                sorted_inp['__mdb_lower'] = confs['lower']
                sorted_inp['__mdb_upper'] = confs['upper']
            sorted_inp['__mdb_hits'] = (sorted_inp['__mdb_lower'] <= sorted_inp[target]) & \
                                       (sorted_inp[target] <= sorted_inp['__mdb_upper'])

        # split the validation rows into equally sized bins and accumulate per-bin |accuracy - confidence| gaps
        size = round(len(sorted_inp) / self.ece_bins)
        bins = []
        ece = 0

        for i in range(1, self.ece_bins):
            bin = sorted_inp.iloc[(i - 1) * size:i * size]
            if len(bin) > 0:
                if task_type == 'categorical':
                    acc = sum(bin[target] == bin['__mdb_prediction']) / size
                else:
                    acc = sum(bin['__mdb_hits'].astype(int)) / len(bin)
                conf = sum(bin['__mdb_confidence']) / size
                gap = abs(acc - conf)
                bins.append(gap)
                ece += gap

        ece /= self.ece_bins
        mce = max(bins) if bins else 0

        # calibration score over the entire validation set
        if task_type == 'categorical':
            global_acc = sum(sorted_inp[target] == sorted_inp['__mdb_prediction']) / len(sorted_inp)
        else:
            global_acc = sum(sorted_inp['__mdb_hits'].astype(int)) / len(sorted_inp)

        global_conf = sorted_inp['__mdb_confidence'].mean()
        global_score = 1.0 - abs(global_acc - global_conf)

        return bins, ece, mce, global_score
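
For reference, the snippet below is a minimal, self-contained sketch (not part of lightwood) of the binning scheme that `_get_stats` documents: rows carrying a confidence score are split into equally sized bins, and each bin contributes the absolute gap between its empirical accuracy and its mean confidence. The toy dataframe, its column names (`confidence`, `hit`) and the random data are illustrative assumptions only.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
n_rows, n_bins = 1000, 10

# toy validation set: a confidence score and whether the prediction was a hit (both made up)
toy = pd.DataFrame({
    'confidence': rng.uniform(0.5, 1.0, n_rows),
    'hit': rng.integers(0, 2, n_rows),
})
toy = toy.sort_values(by='confidence', kind='stable').reset_index(drop=True)

size = round(len(toy) / n_bins)
gaps = []
for i in range(1, n_bins + 1):
    chunk = toy.iloc[(i - 1) * size:i * size]
    if len(chunk) > 0:
        # per-bin absolute gap between empirical accuracy and mean confidence
        gaps.append(abs(chunk['hit'].mean() - chunk['confidence'].mean()))

ece = sum(gaps) / n_bins        # expected calibration error: average gap across bins
mce = max(gaps) if gaps else 0  # maximum calibration error: worst bin
print(round(ece, 3), round(mce, 3))

A well-calibrated model keeps both numbers close to zero; `ConfStats` exposes its own versions of them through the `expected_calibration_error` and `maximum_calibration_error` keys of the analysis `info` dictionary, as set in `analyze` above.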