Source code for CompStats.interface

# Copyright 2025 Sergio Nava Muñoz and Mario Graff Guerrero

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from sklearn.metrics import balanced_accuracy_score
from sklearn.base import clone
import pandas as pd
import numpy as np
from CompStats.bootstrap import StatisticSamples
from CompStats.utils import progress_bar
from CompStats import measurements
from CompStats.measurements import SE
from CompStats.utils import dataframe


[docs] class Perf(object): """Perf is an entry point to CompStats :param y_true: True measurement or could be a pandas.DataFrame where column label 'y' corresponds to the true measurement. :type y_true: numpy.ndarray or pandas.DataFrame :param score_func: Function to measure the performance, it is assumed that the best algorithm has the highest value. :type score_func: Function where the first argument is :math:`y` and the second is :math:`\\hat{y}.` :param error_func: Function to measure the performance where the best algorithm has the lowest value. :type error_func: Function where the first argument is :math:`y` and the second is :math:`\\hat{y}.` :param y_pred: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args.` :type y_pred: numpy.ndarray :param kwargs: Predictions, the algorithms will be identified using the keyword :type kwargs: numpy.ndarray :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads. :type n_jobs: int :param num_samples: Number of bootstrap samples, default=500. :type num_samples: int :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True. :type use_tqdm: bool >>> from sklearn.svm import LinearSVC >>> from sklearn.linear_model import LogisticRegression >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.datasets import load_iris >>> from sklearn.model_selection import train_test_split >>> from sklearn.base import clone >>> from CompStats.interface import Perf >>> X, y = load_iris(return_X_y=True) >>> _ = train_test_split(X, y, test_size=0.3) >>> X_train, X_val, y_train, y_val = _ >>> m = LinearSVC().fit(X_train, y_train) >>> hy = m.predict(X_val) >>> ens = RandomForestClassifier().fit(X_train, y_train) >>> perf = Perf(y_val, hy, forest=ens.predict(X_val)) >>> perf <Perf> Statistic with its standard error (se) statistic (se) 0.9792 (0.0221) <= alg-1 0.9744 (0.0246) <= forest If an algorithm's prediction is missing, this can be included by calling the instance, as can be seen in the following instruction. Note that the algorithm's name can also be given with the keyword :py:attr:`name.` >>> lr = LogisticRegression().fit(X_train, y_train) >>> perf(lr.predict(X_val), name='Log. Reg.') <Perf> Statistic with its standard error (se) statistic (se) 1.0000 (0.0000) <= Log. Reg. 0.9792 (0.0221) <= alg-1 0.9744 (0.0246) <= forest The performance function used to compare the algorithms can be changed, and the same bootstrap samples would be used if the instance were cloned. Consequently, the values are computed using the same samples, as can be seen in the following example. >>> perf_error = clone(perf) >>> perf_error.error_func = lambda y, hy: (y != hy).mean() >>> perf_error <Perf> Statistic with its standard error (se) statistic (se) 0.0000 (0.0000) <= Log. Reg. 0.0222 (0.0237) <= alg-1 0.0222 (0.0215) <= forest """
[docs] def __init__(self, y_true, *y_pred, score_func=balanced_accuracy_score, error_func=None, num_samples: int=500, n_jobs: int=-1, use_tqdm=True, **kwargs): assert (score_func is None) ^ (error_func is None) self.score_func = score_func self.error_func = error_func algs = {} for k, v in enumerate(y_pred): algs[f'alg-{k+1}'] = np.asanyarray(v) algs.update(**kwargs) self.predictions = algs self.y_true = y_true self.num_samples = num_samples self.n_jobs = n_jobs self.use_tqdm = use_tqdm self.sorting_func = np.linalg.norm self._init()
def _init(self): """Compute the bootstrap statistic""" bib = True if self.score_func is not None else False if hasattr(self, '_statistic_samples'): _ = self.statistic_samples _.BiB = bib else: _ = StatisticSamples(statistic=self.statistic_func, n_jobs=self.n_jobs, num_samples=self.num_samples, BiB=bib) _.samples(N=self.y_true.shape[0]) self.statistic_samples = _
[docs] def get_params(self): """Parameters""" return dict(y_true=self.y_true, score_func=self.score_func, error_func=self.error_func, num_samples=self.num_samples, n_jobs=self.n_jobs)
def __sklearn_clone__(self): klass = self.__class__ params = self.get_params() ins = klass(**params) ins.predictions = dict(self.predictions) ins._statistic_samples._samples = self.statistic_samples._samples ins.sorting_func = self.sorting_func return ins def __repr__(self): """Prediction statistics with standard error in parenthesis""" arg = 'score_func' if self.error_func is None else 'error_func' func_name = self.statistic_func.__name__ statistic = self.statistic if isinstance(statistic, dict): return f"<{self.__class__.__name__}({arg}={func_name})>\n{self}" elif isinstance(statistic, float): return f"<{self.__class__.__name__}({arg}={func_name}, statistic={statistic:0.4f}, se={self.se:0.4f})>" desc = [f'{k:0.4f}' for k in statistic] desc = ', '.join(desc) desc_se = [f'{k:0.4f}' for k in self.se] desc_se = ', '.join(desc_se) return f"<{self.__class__.__name__}({arg}={func_name}, statistic=[{desc}], se=[{desc_se}])>" def __str__(self): """Prediction statistics with standard error in parenthesis""" if not isinstance(self.statistic, dict): return self.__repr__() se = self.se output = ["Statistic with its standard error (se)"] output.append("statistic (se)") for key, value in self.statistic.items(): if isinstance(value, float): desc = f'{value:0.4f} ({se[key]:0.4f}) <= {key}' else: desc = [f'{v:0.4f} ({k:0.4f})' for v, k in zip(value, se[key])] desc = ', '.join(desc) desc = f'{desc} <= {key}' output.append(desc) return "\n".join(output) def __call__(self, y_pred, name=None): """Add predictions""" if name is None: k = len(self.predictions) + 1 if k == 0: k = 1 name = f'alg-{k}' self.best = None self.predictions[name] = np.asanyarray(y_pred) samples = self._statistic_samples calls = samples.calls if name in calls: del calls[name] return self
[docs] def difference(self, wrt: str=None): """Compute the difference w.r.t any algorithm by default is the best >>> from sklearn.svm import LinearSVC >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.datasets import load_iris >>> from sklearn.model_selection import train_test_split >>> from sklearn.base import clone >>> from CompStats.interface import Perf >>> X, y = load_iris(return_X_y=True) >>> _ = train_test_split(X, y, test_size=0.3) >>> X_train, X_val, y_train, y_val = _ >>> m = LinearSVC().fit(X_train, y_train) >>> hy = m.predict(X_val) >>> ens = RandomForestClassifier().fit(X_train, y_train) >>> perf = Perf(y_val, hy, forest=ens.predict(X_val)) >>> perf.difference() <Difference> difference p-values w.r.t alg-1 forest 0.06 """ if wrt is None: wrt = self.best if isinstance(wrt, str): base = self.statistic_samples.calls[wrt] else: base = np.array([self.statistic_samples.calls[key][:, col] for col, key in enumerate(wrt)]).T sign = 1 if self.statistic_samples.BiB else -1 diff = dict() for k, v in self.statistic_samples.calls.items(): if base.ndim == 1 and k == wrt: continue diff[k] = sign * (base - v) diff_ins = Difference(statistic_samples=clone(self.statistic_samples), statistic=self.statistic) diff_ins.sorting_func = self.sorting_func diff_ins.statistic_samples.calls = diff diff_ins.statistic_samples.info['best'] = self.best diff_ins.best = self.best return diff_ins
@property def best(self): """System with best performance""" if hasattr(self, '_best') and self._best is not None: return self._best if not isinstance(self.statistic, dict): key, value = list(self.statistic_samples.calls.items())[0] if value.ndim == 1: self._best = key else: self._best = np.array([key] * value.shape[1]) return self._best BiB = bool(self.statistic_samples.BiB) keys = np.array(list(self.statistic.keys())) data = np.asanyarray([self.statistic[k] for k in keys]) if isinstance(self.statistic[keys[0]], np.ndarray): if BiB: best = data.argmax(axis=0) else: best = data.argmin(axis=0) else: if BiB: best = data.argmax() else: best = data.argmin() self._best = keys[best] return self._best @best.setter def best(self, value): self._best = value @property def sorting_func(self): """Rank systems when multiple performances are used""" return self._sorting_func @sorting_func.setter def sorting_func(self, value): self._sorting_func = value @property def statistic(self): """Statistic >>> from sklearn.svm import LinearSVC >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.datasets import load_iris >>> from sklearn.model_selection import train_test_split >>> from CompStats.interface import Perf >>> X, y = load_iris(return_X_y=True) >>> _ = train_test_split(X, y, test_size=0.3) >>> X_train, X_val, y_train, y_val = _ >>> m = LinearSVC().fit(X_train, y_train) >>> hy = m.predict(X_val) >>> ens = RandomForestClassifier().fit(X_train, y_train) >>> perf = Perf(y_val, hy, forest=ens.predict(X_val)) >>> perf.statistic {'alg-1': 1.0, 'forest': 0.9500891265597148} """ data = sorted([(k, self.statistic_func(self.y_true, v)) for k, v in self.predictions.items()], key=lambda x: self.sorting_func(x[1]), reverse=self.statistic_samples.BiB) if len(data) == 1: return data[0][1] return dict(data) @property def se(self): """Standard Error >>> from sklearn.svm import LinearSVC >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.datasets import load_iris >>> from sklearn.model_selection import train_test_split >>> from CompStats.interface import Perf >>> X, y = load_iris(return_X_y=True) >>> _ = train_test_split(X, y, test_size=0.3) >>> X_train, X_val, y_train, y_val = _ >>> m = LinearSVC().fit(X_train, y_train) >>> hy = m.predict(X_val) >>> ens = RandomForestClassifier().fit(X_train, y_train) >>> perf = Perf(y_val, hy, forest=ens.predict(X_val)) >>> perf.se {'alg-1': 0.0, 'forest': 0.026945730782184187} """ output = SE(self.statistic_samples) if len(output) == 1: return list(output.values())[0] return output
[docs] def plot(self, value_name:str=None, var_name:str='Performance', alg_legend:str='Algorithm', perf_names:list=None, CI:float=0.05, kind:str='point', linestyle:str='none', col_wrap:int=3, capsize:float=0.2, comparison:bool=True, right:bool=True, comp_legend:str='Comparison', winner_legend:str='Best', tie_legend:str='Equivalent', loser_legend:str='Different', **kwargs): """plot with seaborn >>> from sklearn.svm import LinearSVC >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.datasets import load_iris >>> from sklearn.model_selection import train_test_split >>> from CompStats.interface import Perf >>> X, y = load_iris(return_X_y=True) >>> _ = train_test_split(X, y, test_size=0.3) >>> X_train, X_val, y_train, y_val = _ >>> m = LinearSVC().fit(X_train, y_train) >>> hy = m.predict(X_val) >>> ens = RandomForestClassifier().fit(X_train, y_train) >>> perf = Perf(y_val, hy, score_func=None, error_func=lambda y, hy: (y != hy).mean(), forest=ens.predict(X_val)) >>> perf.plot() """ import seaborn as sns if value_name is None: if self.score_func is not None: value_name = 'Score' else: value_name = 'Error' if not isinstance(self.statistic, dict): comparison = False best = self.best if isinstance(best, np.ndarray): if best.shape[0] < col_wrap: col_wrap = best.shape[0] df = self.dataframe(value_name=value_name, var_name=var_name, alg_legend=alg_legend, perf_names=perf_names, comparison=comparison, alpha=CI, right=right, comp_legend=comp_legend, winner_legend=winner_legend, tie_legend=tie_legend, loser_legend=loser_legend) if var_name not in df.columns: var_name = None col_wrap = None ci = lambda x: measurements.CI(x, alpha=CI) if comparison: kwargs.update(dict(hue=comp_legend)) f_grid = sns.catplot(df, x=value_name, errorbar=ci, y=alg_legend, col=var_name, kind=kind, linestyle=linestyle, col_wrap=col_wrap, capsize=capsize, **kwargs) return f_grid
[docs] def dataframe(self, comparison:bool=False, right:bool=True, alpha:float=0.05, value_name:str='Score', var_name:str='Performance', alg_legend:str='Algorithm', comp_legend:str='Comparison', winner_legend:str='Best', tie_legend:str='Equivalent', loser_legend:str='Different', perf_names:str=None): """Dataframe""" if perf_names is None and isinstance(self.best, np.ndarray): func_name = self.statistic_func.__name__ perf_names = [f'{func_name}({i})' for i, k in enumerate(self.best)] df = dataframe(self, value_name=value_name, var_name=var_name, alg_legend=alg_legend, perf_names=perf_names) if not comparison: return df df[comp_legend] = tie_legend diff = self.difference() best = self.best if isinstance(best, str): for name, p in diff.p_value(right=right).items(): if p >= alpha: continue df.loc[df[alg_legend] == name, comp_legend] = loser_legend df.loc[df[alg_legend] == best, comp_legend] = winner_legend else: p_values = diff.p_value(right=right) systems = list(p_values.keys()) p_values = np.array([p_values[k] for k in systems]) for name, p_value, winner in zip(perf_names, p_values.T, best): mask = df[var_name] == name for alg, p in zip(systems, p_value): if p >= alpha and winner != alg: continue _ = mask & (df[alg_legend] == alg) if winner == alg: df.loc[_, comp_legend] = winner_legend else: df.loc[_, comp_legend] = loser_legend return df
@property def n_jobs(self): """Number of jobs to compute the statistics""" return self._n_jobs @n_jobs.setter def n_jobs(self, value): self._n_jobs = value @property def statistic_func(self): """Statistic function""" if self.score_func is not None: return self.score_func return self.error_func @property def statistic_samples(self): """Statistic Samples""" samples = self._statistic_samples algs = set(samples.calls.keys()) algs = set(self.predictions.keys()) - algs if len(algs): for key in progress_bar(algs, use_tqdm=self.use_tqdm): samples(self.y_true, self.predictions[key], name=key) return self._statistic_samples @statistic_samples.setter def statistic_samples(self, value): self._statistic_samples = value @property def num_samples(self): """Number of bootstrap samples""" return self._num_samples @num_samples.setter def num_samples(self, value): self._num_samples = value @property def predictions(self): """Predictions""" return self._predictions @predictions.setter def predictions(self, value): self._predictions = value @property def y_true(self): """True output, gold standard o :math:`y`""" return self._y_true @y_true.setter def y_true(self, value): if isinstance(value, pd.DataFrame): self._y_true = value['y'].to_numpy() algs = {} for c in value.columns: if c == 'y': continue algs[c] = value[c].to_numpy() self.predictions.update(algs) return self._y_true = value @property def score_func(self): """Score function""" return self._score_func @score_func.setter def score_func(self, value): self._score_func = value if value is not None: self.error_func = None if hasattr(self, '_statistic_samples'): self._statistic_samples.statistic = value self._statistic_samples.BiB = True @property def error_func(self): """Error function""" return self._error_func @error_func.setter def error_func(self, value): self._error_func = value if value is not None: self.score_func = None if hasattr(self, '_statistic_samples'): self._statistic_samples.statistic = value self._statistic_samples.BiB = False
[docs] @dataclass class Difference: """Difference >>> from sklearn.svm import LinearSVC >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.datasets import load_iris >>> from sklearn.model_selection import train_test_split >>> from sklearn.base import clone >>> from CompStats.interface import Perf >>> X, y = load_iris(return_X_y=True) >>> _ = train_test_split(X, y, test_size=0.3) >>> X_train, X_val, y_train, y_val = _ >>> m = LinearSVC().fit(X_train, y_train) >>> hy = m.predict(X_val) >>> ens = RandomForestClassifier().fit(X_train, y_train) >>> perf = Perf(y_val, hy, forest=ens.predict(X_val)) >>> diff = perf.difference() >>> diff <Difference> difference p-values w.r.t alg-1 0.0780 <= forest """ statistic_samples:StatisticSamples=None statistic:dict=None best:str=None @property def sorting_func(self): """Rank systems when multiple performances are used""" return self._sorting_func @sorting_func.setter def sorting_func(self, value): self._sorting_func = value def __repr__(self): """p-value""" return f"<{self.__class__.__name__}>\n{self}" def __str__(self): """p-value""" if isinstance(self.best, str): best = f' w.r.t {self.best}' else: best = '' output = [f"difference p-values {best}"] best = self.best if isinstance(best, np.ndarray): desc = ', '.join(best) output.append(f'{desc} <= Best') for key, value in self.p_value().items(): if isinstance(value, float): output.append(f'{value:0.4f} <= {key}') else: desc = [f'{v:0.4f}' for v in value] desc = ', '.join(desc) desc = f'{desc} <= {key}' output.append(desc) return "\n".join(output) def _delta_best(self): """Compute multiple delta""" if isinstance(self.best, str): return self.statistic[self.best] keys = np.unique(self.best) statistic = np.array([self.statistic[k] for k in keys]) m = {v: k for k, v in enumerate(keys)} best = np.array([m[x] for x in self.best]) return statistic[best, np.arange(best.shape[0])]
[docs] def p_value(self, right:bool=True): """Compute p_value of the differences :param right: Estimate the p-value using :math:`\\text{sample} \\geq 2\\delta` :type right: bool >>> from sklearn.svm import LinearSVC >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.datasets import load_iris >>> from sklearn.model_selection import train_test_split >>> from sklearn.base import clone >>> from CompStats.interface import Perf >>> X, y = load_iris(return_X_y=True) >>> _ = train_test_split(X, y, test_size=0.3) >>> X_train, X_val, y_train, y_val = _ >>> m = LinearSVC().fit(X_train, y_train) >>> hy = m.predict(X_val) >>> ens = RandomForestClassifier().fit(X_train, y_train) >>> perf = Perf(y_val, hy, forest=ens.predict(X_val)) >>> diff = perf.difference() >>> diff.p_value() {'forest': np.float64(0.3)} """ values = [] sign = 1 if self.statistic_samples.BiB else -1 delta_best = self._delta_best() for k, v in self.statistic_samples.calls.items(): delta = 2 * sign * (delta_best - self.statistic[k]) if not isinstance(delta_best, np.ndarray): if right: values.append((k, (v >= delta).mean())) else: values.append((k, (v <= 0).mean())) else: if right: values.append((k, (v >= delta).mean(axis=0))) else: values.append((k, (v <= 0).mean(axis=0))) values.sort(key=lambda x: self.sorting_func(x[1])) return dict(values)
[docs] def dataframe(self, value_name:str='Score', var_name:str='Best', alg_legend:str='Algorithm', sig_legend:str='Significant', perf_names:str=None, right:bool=True, alpha:float=0.05): """Dataframe""" if perf_names is None and isinstance(self.best, np.ndarray): perf_names = [f'{alg}({k})' for k, alg in enumerate(self.best)] df = dataframe(self, value_name=value_name, var_name=var_name, alg_legend=alg_legend, perf_names=perf_names) df[sig_legend] = False if isinstance(self.best, str): for name, p in self.p_value(right=right).items(): if p >= alpha: continue df.loc[df[alg_legend] == name, sig_legend] = True else: p_values = self.p_value(right=right) systems = list(p_values.keys()) p_values = np.array([p_values[k] for k in systems]) for name, p_value in zip(perf_names, p_values.T): mask = df[var_name] == name for alg, p in zip(systems, p_value): if p >= alpha: continue _ = mask & (df[alg_legend] == alg) df.loc[_, sig_legend] = True return df
[docs] def plot(self, value_name:str='Difference', var_name:str='Best', alg_legend:str='Algorithm', sig_legend:str='Significant', perf_names:list=None, alpha:float=0.05, right:bool=True, kind:str='point', linestyle:str='none', col_wrap:int=3, capsize:float=0.2, set_refline:bool=True, **kwargs): """Plot >>> from sklearn.svm import LinearSVC >>> from sklearn.ensemble import RandomForestClassifier >>> from sklearn.datasets import load_iris >>> from sklearn.model_selection import train_test_split >>> from sklearn.base import clone >>> from CompStats.interface import Perf >>> X, y = load_iris(return_X_y=True) >>> _ = train_test_split(X, y, test_size=0.3) >>> X_train, X_val, y_train, y_val = _ >>> m = LinearSVC().fit(X_train, y_train) >>> hy = m.predict(X_val) >>> ens = RandomForestClassifier().fit(X_train, y_train) >>> perf = Perf(y_val, hy, forest=ens.predict(X_val)) >>> diff = perf.difference() >>> diff.plot() """ import seaborn as sns df = self.dataframe(value_name=value_name, var_name=var_name, alg_legend=alg_legend, sig_legend=sig_legend, perf_names=perf_names, alpha=alpha, right=right) title = var_name if var_name not in df.columns: var_name = None col_wrap = None ci = lambda x: measurements.CI(x, alpha=2*alpha) f_grid = sns.catplot(df, x=value_name, errorbar=ci, y=alg_legend, col=var_name, kind=kind, linestyle=linestyle, col_wrap=col_wrap, capsize=capsize, hue=sig_legend, **kwargs) if set_refline: f_grid.refline(x=0) if isinstance(self.best, str): f_grid.facet_axis(0, 0).set_title(f'{title} = {self.best}') return f_grid