# Copyright 2025 Sergio Nava Muñoz and Mario Graff Guerrero
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from sklearn.metrics import balanced_accuracy_score
from sklearn.base import clone
import pandas as pd
import numpy as np
from CompStats.bootstrap import StatisticSamples
from CompStats.utils import progress_bar
from CompStats import measurements
from CompStats.measurements import SE
from CompStats.utils import dataframe
class Perf(object):
    """Perf is an entry point to CompStats

    :param y_true: True measurement or could be a pandas.DataFrame where column label 'y' corresponds to the true measurement.
    :type y_true: numpy.ndarray or pandas.DataFrame
    :param score_func: Function to measure the performance, it is assumed that the best algorithm has the highest value.
    :type score_func: Function where the first argument is :math:`y` and the second is :math:`\\hat{y}.`
    :param error_func: Function to measure the performance where the best algorithm has the lowest value.
    :type error_func: Function where the first argument is :math:`y` and the second is :math:`\\hat{y}.`
    :param y_pred: Predictions, the algorithms will be identified with alg-k where k=1 is the first argument included in :py:attr:`args.`
    :type y_pred: numpy.ndarray
    :param kwargs: Predictions, the algorithms will be identified using the keyword
    :type kwargs: numpy.ndarray
    :param n_jobs: Number of jobs to compute the statistic, default=-1 corresponding to use all threads.
    :type n_jobs: int
    :param num_samples: Number of bootstrap samples, default=500.
    :type num_samples: int
    :param use_tqdm: Whether to use tqdm.tqdm to visualize the progress, default=True.
    :type use_tqdm: bool

    Exactly one of ``score_func`` and ``error_func`` must be given; pass
    ``score_func=None`` when supplying ``error_func``.

    >>> from sklearn.svm import LinearSVC
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.base import clone
    >>> from CompStats.interface import Perf
    >>> X, y = load_iris(return_X_y=True)
    >>> _ = train_test_split(X, y, test_size=0.3)
    >>> X_train, X_val, y_train, y_val = _
    >>> m = LinearSVC().fit(X_train, y_train)
    >>> hy = m.predict(X_val)
    >>> ens = RandomForestClassifier().fit(X_train, y_train)
    >>> perf = Perf(y_val, hy, forest=ens.predict(X_val))
    >>> perf
    <Perf(score_func=balanced_accuracy_score)>
    Statistic with its standard error (se)
    statistic (se)
    0.9792 (0.0221) <= alg-1
    0.9744 (0.0246) <= forest

    If an algorithm's prediction is missing, this can be included by calling the instance, as can be seen in the following instruction. Note that the algorithm's name can also be given with the keyword :py:attr:`name.`

    >>> lr = LogisticRegression().fit(X_train, y_train)
    >>> perf(lr.predict(X_val), name='Log. Reg.')
    <Perf(score_func=balanced_accuracy_score)>
    Statistic with its standard error (se)
    statistic (se)
    1.0000 (0.0000) <= Log. Reg.
    0.9792 (0.0221) <= alg-1
    0.9744 (0.0246) <= forest

    The performance function used to compare the algorithms can be changed, and the same bootstrap samples would be used if the instance were cloned. Consequently, the values are computed using the same samples, as can be seen in the following example.

    >>> perf_error = clone(perf)
    >>> perf_error.error_func = lambda y, hy: (y != hy).mean()
    >>> perf_error
    <Perf(error_func=<lambda>)>
    Statistic with its standard error (se)
    statistic (se)
    0.0000 (0.0000) <= Log. Reg.
    0.0222 (0.0237) <= alg-1
    0.0222 (0.0215) <= forest
    """

    def __init__(self, y_true, *y_pred,
                 score_func=balanced_accuracy_score,
                 error_func=None,
                 num_samples: int=500,
                 n_jobs: int=-1,
                 use_tqdm=True,
                 **kwargs):
        """Store the configuration and predictions and create the bootstrap samples."""
        assert (score_func is None) ^ (error_func is None), \
            "exactly one of score_func and error_func must be given " \
            "(use score_func=None together with error_func)"
        self.score_func = score_func
        self.error_func = error_func
        algs = {f'alg-{k + 1}': np.asanyarray(v)
                for k, v in enumerate(y_pred)}
        # Keyword predictions are converted like the positional ones (and
        # like those added through __call__) so every entry is an array.
        algs.update({k: np.asanyarray(v) for k, v in kwargs.items()})
        # predictions must exist before y_true is assigned: the y_true
        # setter adds the DataFrame columns to it.
        self.predictions = algs
        self.y_true = y_true
        self.num_samples = num_samples
        self.n_jobs = n_jobs
        self.use_tqdm = use_tqdm
        self.sorting_func = np.linalg.norm
        self._init()

    def _init(self):
        """Create (or update) the object holding the bootstrap samples"""
        bib = self.score_func is not None
        if hasattr(self, '_statistic_samples'):
            samples = self.statistic_samples
            samples.BiB = bib
        else:
            samples = StatisticSamples(statistic=self.statistic_func,
                                       n_jobs=self.n_jobs,
                                       num_samples=self.num_samples,
                                       BiB=bib)
            samples.samples(N=self.y_true.shape[0])
        self.statistic_samples = samples

    def _sync_statistic(self, func, bib):
        """Propagate a new statistic function to the bootstrap samples.

        The bootstrap results and the cached best system were computed with
        the previous function, hence they are discarded; the bootstrap
        indices themselves are kept.
        """
        if not hasattr(self, '_statistic_samples'):
            return
        samples = self._statistic_samples
        samples.statistic = func
        samples.BiB = bib
        samples.calls = {}
        self._best = None

    def get_params(self):
        """Parameters used to create a new instance (predictions are not included)"""
        return dict(y_true=self.y_true,
                    score_func=self.score_func,
                    error_func=self.error_func,
                    num_samples=self.num_samples,
                    n_jobs=self.n_jobs)

    def __sklearn_clone__(self):
        """Clone sharing the bootstrap samples but not the computed statistics"""
        klass = self.__class__
        ins = klass(**self.get_params())
        # use_tqdm is not part of get_params; copy it so the clone keeps
        # the progress-bar setting of the original.
        ins.use_tqdm = self.use_tqdm
        ins.predictions = dict(self.predictions)
        ins._statistic_samples._samples = self.statistic_samples._samples
        ins.sorting_func = self.sorting_func
        return ins

    def __repr__(self):
        """Prediction statistics with standard error in parenthesis"""
        arg = 'score_func' if self.error_func is None else 'error_func'
        func = self.statistic_func
        # Callables such as functools.partial objects have no __name__.
        func_name = getattr(func, '__name__', repr(func))
        cls_name = self.__class__.__name__
        statistic = self.statistic
        if isinstance(statistic, dict):
            return f"<{cls_name}({arg}={func_name})>\n{self}"
        if np.ndim(statistic) == 0:
            return f"<{cls_name}({arg}={func_name}, statistic={statistic:0.4f}, se={self.se:0.4f})>"
        desc = ', '.join(f'{k:0.4f}' for k in statistic)
        desc_se = ', '.join(f'{k:0.4f}' for k in self.se)
        return f"<{cls_name}({arg}={func_name}, statistic=[{desc}], se=[{desc_se}])>"

    def __str__(self):
        """Prediction statistics with standard error in parenthesis"""
        statistic = self.statistic
        if not isinstance(statistic, dict):
            return self.__repr__()
        se = self.se
        output = ["Statistic with its standard error (se)",
                  "statistic (se)"]
        for key, value in statistic.items():
            if np.ndim(value) == 0:
                desc = f'{value:0.4f} ({se[key]:0.4f}) <= {key}'
            else:
                desc = ', '.join(f'{v:0.4f} ({k:0.4f})'
                                 for v, k in zip(value, se[key]))
                desc = f'{desc} <= {key}'
            output.append(desc)
        return "\n".join(output)

    def __call__(self, y_pred, name=None):
        """Add predictions

        :param y_pred: Predictions of the algorithm.
        :param name: Name of the algorithm; by default alg-k is used, where k is the first index, starting at the number of stored predictions plus one, that is not already in use.
        :type name: str
        :return: The instance itself.
        """
        if name is None:
            k = len(self.predictions) + 1
            # Do not overwrite predictions previously stored as alg-k.
            while f'alg-{k}' in self.predictions:
                k += 1
            name = f'alg-{k}'
        self.best = None
        self.predictions[name] = np.asanyarray(y_pred)
        calls = self._statistic_samples.calls
        # Drop the bootstrap results of a replaced prediction so that
        # they are recomputed on the next access.
        if name in calls:
            del calls[name]
        return self

    def difference(self, wrt: str=None):
        """Compute the difference w.r.t any algorithm by default is the best

        :param wrt: Algorithm used as reference; an array with one name per performance can be given when the statistic has multiple values. By default the best algorithm is used.
        :type wrt: str
        :return: Instance with the bootstrap differences w.r.t. the reference.
        :rtype: Difference

        >>> from sklearn.svm import LinearSVC
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from sklearn.datasets import load_iris
        >>> from sklearn.model_selection import train_test_split
        >>> from sklearn.base import clone
        >>> from CompStats.interface import Perf
        >>> X, y = load_iris(return_X_y=True)
        >>> _ = train_test_split(X, y, test_size=0.3)
        >>> X_train, X_val, y_train, y_val = _
        >>> m = LinearSVC().fit(X_train, y_train)
        >>> hy = m.predict(X_val)
        >>> ens = RandomForestClassifier().fit(X_train, y_train)
        >>> perf = Perf(y_val, hy, forest=ens.predict(X_val))
        >>> perf.difference()
        <Difference>
        difference p-values w.r.t alg-1
        0.0780 <= forest
        """
        if wrt is None:
            wrt = self.best
        samples = self.statistic_samples
        calls = samples.calls
        if isinstance(wrt, str):
            base = calls[wrt]
            if base.ndim == 1:
                reference = wrt
            else:
                # One reference name per performance column.
                reference = np.array([wrt] * base.shape[1])
        else:
            reference = np.asarray(wrt)
            base = np.array([calls[key][:, col]
                             for col, key in enumerate(reference)]).T
        sign = 1 if samples.BiB else -1
        diff = {}
        for k, v in calls.items():
            if base.ndim == 1 and k == wrt:
                continue
            diff[k] = sign * (base - v)
        diff_ins = Difference(statistic_samples=clone(samples),
                              statistic=self.statistic)
        diff_ins.sorting_func = self.sorting_func
        diff_ins.statistic_samples.calls = diff
        # The differences were computed against ``wrt``; record it as the
        # reference so the p-values and labels agree with the samples.
        diff_ins.statistic_samples.info['best'] = reference
        diff_ins.best = reference
        return diff_ins

    @property
    def best(self):
        """System with best performance"""
        cached = getattr(self, '_best', None)
        if cached is not None:
            return cached
        statistic = self.statistic
        if not isinstance(statistic, dict):
            # Only one system: it is the best in every performance.
            key, value = list(self.statistic_samples.calls.items())[0]
            if value.ndim == 1:
                self._best = key
            else:
                self._best = np.array([key] * value.shape[1])
            return self._best
        BiB = bool(self.statistic_samples.BiB)
        keys = np.array(list(statistic.keys()))
        data = np.asanyarray([statistic[k] for k in keys])
        if isinstance(statistic[keys[0]], np.ndarray):
            # One winner per performance (column).
            best = data.argmax(axis=0) if BiB else data.argmin(axis=0)
        else:
            best = data.argmax() if BiB else data.argmin()
        self._best = keys[best]
        return self._best

    @best.setter
    def best(self, value):
        self._best = value

    @property
    def sorting_func(self):
        """Rank systems when multiple performances are used"""
        return self._sorting_func

    @sorting_func.setter
    def sorting_func(self, value):
        self._sorting_func = value

    @property
    def statistic(self):
        """Statistic

        The value of the statistic function when there is only one
        system; otherwise, a dictionary sorted with :py:attr:`sorting_func`.

        >>> from sklearn.svm import LinearSVC
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from sklearn.datasets import load_iris
        >>> from sklearn.model_selection import train_test_split
        >>> from CompStats.interface import Perf
        >>> X, y = load_iris(return_X_y=True)
        >>> _ = train_test_split(X, y, test_size=0.3)
        >>> X_train, X_val, y_train, y_val = _
        >>> m = LinearSVC().fit(X_train, y_train)
        >>> hy = m.predict(X_val)
        >>> ens = RandomForestClassifier().fit(X_train, y_train)
        >>> perf = Perf(y_val, hy, forest=ens.predict(X_val))
        >>> perf.statistic
        {'alg-1': 1.0, 'forest': 0.9500891265597148}
        """
        data = sorted([(k, self.statistic_func(self.y_true, v))
                       for k, v in self.predictions.items()],
                      key=lambda x: self.sorting_func(x[1]),
                      reverse=self.statistic_samples.BiB)
        if len(data) == 1:
            return data[0][1]
        return dict(data)

    @property
    def se(self):
        """Standard Error

        >>> from sklearn.svm import LinearSVC
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from sklearn.datasets import load_iris
        >>> from sklearn.model_selection import train_test_split
        >>> from CompStats.interface import Perf
        >>> X, y = load_iris(return_X_y=True)
        >>> _ = train_test_split(X, y, test_size=0.3)
        >>> X_train, X_val, y_train, y_val = _
        >>> m = LinearSVC().fit(X_train, y_train)
        >>> hy = m.predict(X_val)
        >>> ens = RandomForestClassifier().fit(X_train, y_train)
        >>> perf = Perf(y_val, hy, forest=ens.predict(X_val))
        >>> perf.se
        {'alg-1': 0.0, 'forest': 0.026945730782184187}
        """
        output = SE(self.statistic_samples)
        if len(output) == 1:
            return next(iter(output.values()))
        return output

    def plot(self, value_name:str=None,
             var_name:str='Performance',
             alg_legend:str='Algorithm',
             perf_names:list=None,
             CI:float=0.05,
             kind:str='point', linestyle:str='none',
             col_wrap:int=3, capsize:float=0.2,
             comparison:bool=True,
             right:bool=True,
             comp_legend:str='Comparison',
             winner_legend:str='Best',
             tie_legend:str='Equivalent',
             loser_legend:str='Different',
             **kwargs):
        """plot with seaborn

        The remaining keyword arguments are given to seaborn.catplot.

        >>> from sklearn.svm import LinearSVC
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from sklearn.datasets import load_iris
        >>> from sklearn.model_selection import train_test_split
        >>> from CompStats.interface import Perf
        >>> X, y = load_iris(return_X_y=True)
        >>> _ = train_test_split(X, y, test_size=0.3)
        >>> X_train, X_val, y_train, y_val = _
        >>> m = LinearSVC().fit(X_train, y_train)
        >>> hy = m.predict(X_val)
        >>> ens = RandomForestClassifier().fit(X_train, y_train)
        >>> perf = Perf(y_val, hy, score_func=None,
        ...             error_func=lambda y, hy: (y != hy).mean(),
        ...             forest=ens.predict(X_val))
        >>> perf.plot()
        """
        import seaborn as sns
        if value_name is None:
            value_name = 'Score' if self.score_func is not None else 'Error'
        if not isinstance(self.statistic, dict):
            # A single system cannot be compared against another one.
            comparison = False
        best = self.best
        if isinstance(best, np.ndarray):
            # Do not request more columns than performances.
            col_wrap = min(col_wrap, best.shape[0])
        df = self.dataframe(value_name=value_name, var_name=var_name,
                            alg_legend=alg_legend, perf_names=perf_names,
                            comparison=comparison, alpha=CI, right=right,
                            comp_legend=comp_legend,
                            winner_legend=winner_legend,
                            tie_legend=tie_legend,
                            loser_legend=loser_legend)
        if var_name not in df.columns:
            var_name = None
            col_wrap = None

        def ci(x):
            return measurements.CI(x, alpha=CI)

        if comparison:
            kwargs.update(dict(hue=comp_legend))
        f_grid = sns.catplot(df, x=value_name, errorbar=ci,
                             y=alg_legend, col=var_name,
                             kind=kind, linestyle=linestyle,
                             col_wrap=col_wrap, capsize=capsize, **kwargs)
        return f_grid

    def dataframe(self, comparison:bool=False,
                  right:bool=True,
                  alpha:float=0.05,
                  value_name:str='Score',
                  var_name:str='Performance',
                  alg_legend:str='Algorithm',
                  comp_legend:str='Comparison',
                  winner_legend:str='Best',
                  tie_legend:str='Equivalent',
                  loser_legend:str='Different',
                  perf_names:list=None):
        """Dataframe

        When ``comparison`` is True, column ``comp_legend`` is added; it
        contains ``winner_legend`` for the best system, ``loser_legend``
        for the systems whose p-value is lower than ``alpha``, and
        ``tie_legend`` otherwise.
        """
        best = self.best
        if perf_names is None and isinstance(best, np.ndarray):
            func = self.statistic_func
            func_name = getattr(func, '__name__', repr(func))
            perf_names = [f'{func_name}({i})' for i in range(len(best))]
        df = dataframe(self, value_name=value_name,
                       var_name=var_name,
                       alg_legend=alg_legend,
                       perf_names=perf_names)
        if not comparison:
            return df
        df[comp_legend] = tie_legend
        diff = self.difference()
        if isinstance(best, str):
            for name, p in diff.p_value(right=right).items():
                if p >= alpha:
                    continue
                df.loc[df[alg_legend] == name, comp_legend] = loser_legend
            df.loc[df[alg_legend] == best, comp_legend] = winner_legend
        else:
            p_values = diff.p_value(right=right)
            systems = list(p_values.keys())
            p_values = np.array([p_values[k] for k in systems])
            for name, p_value, winner in zip(perf_names,
                                             p_values.T,
                                             best):
                mask = df[var_name] == name
                for alg, p in zip(systems, p_value):
                    if p >= alpha and winner != alg:
                        continue
                    _ = mask & (df[alg_legend] == alg)
                    if winner == alg:
                        df.loc[_, comp_legend] = winner_legend
                    else:
                        df.loc[_, comp_legend] = loser_legend
        return df

    @property
    def n_jobs(self):
        """Number of jobs to compute the statistics"""
        return self._n_jobs

    @n_jobs.setter
    def n_jobs(self, value):
        self._n_jobs = value

    @property
    def statistic_func(self):
        """Statistic function"""
        if self.score_func is not None:
            return self.score_func
        return self.error_func

    @property
    def statistic_samples(self):
        """Statistic Samples

        The bootstrap statistic of the predictions that have not been
        processed is computed when the property is accessed.
        """
        samples = self._statistic_samples
        pending = set(self.predictions.keys()) - set(samples.calls.keys())
        if len(pending):
            for key in progress_bar(pending, use_tqdm=self.use_tqdm):
                samples(self.y_true, self.predictions[key], name=key)
        return self._statistic_samples

    @statistic_samples.setter
    def statistic_samples(self, value):
        self._statistic_samples = value

    @property
    def num_samples(self):
        """Number of bootstrap samples"""
        return self._num_samples

    @num_samples.setter
    def num_samples(self, value):
        self._num_samples = value

    @property
    def predictions(self):
        """Predictions"""
        return self._predictions

    @predictions.setter
    def predictions(self, value):
        self._predictions = value

    @property
    def y_true(self):
        """True output, gold standard or :math:`y`

        When a pandas.DataFrame is assigned, column 'y' is used as the
        true output and the remaining columns are added to the predictions.
        """
        return self._y_true

    @y_true.setter
    def y_true(self, value):
        if isinstance(value, pd.DataFrame):
            self._y_true = value['y'].to_numpy()
            algs = {c: value[c].to_numpy()
                    for c in value.columns if c != 'y'}
            self.predictions.update(algs)
            return
        self._y_true = value

    @property
    def score_func(self):
        """Score function

        Setting a function unsets :py:attr:`error_func` and discards the
        bootstrap statistics computed with the previous function.
        """
        return self._score_func

    @score_func.setter
    def score_func(self, value):
        self._score_func = value
        if value is not None:
            self.error_func = None
            self._sync_statistic(value, True)

    @property
    def error_func(self):
        """Error function

        Setting a function unsets :py:attr:`score_func` and discards the
        bootstrap statistics computed with the previous function.
        """
        return self._error_func

    @error_func.setter
    def error_func(self, value):
        self._error_func = value
        if value is not None:
            self.score_func = None
            self._sync_statistic(value, False)
@dataclass
class Difference:
    """Difference

    Bootstrap differences between each system and a reference system,
    normally created with :py:func:`Perf.difference`.

    :param statistic_samples: Instance whose attribute ``calls`` maps each system to its bootstrap differences w.r.t. the reference and whose attribute ``BiB`` indicates whether bigger is better.
    :param statistic: Statistic of each system.
    :type statistic: dict
    :param best: Reference system, or an array with one reference per performance.
    :type best: str or numpy.ndarray

    >>> from sklearn.svm import LinearSVC
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn.base import clone
    >>> from CompStats.interface import Perf
    >>> X, y = load_iris(return_X_y=True)
    >>> _ = train_test_split(X, y, test_size=0.3)
    >>> X_train, X_val, y_train, y_val = _
    >>> m = LinearSVC().fit(X_train, y_train)
    >>> hy = m.predict(X_val)
    >>> ens = RandomForestClassifier().fit(X_train, y_train)
    >>> perf = Perf(y_val, hy, forest=ens.predict(X_val))
    >>> diff = perf.difference()
    >>> diff
    <Difference>
    difference p-values w.r.t alg-1
    0.0780 <= forest
    """

    # Quoted so that the annotation is not evaluated at class creation.
    statistic_samples: "StatisticSamples" = None
    statistic: dict = None
    best: str = None

    @property
    def sorting_func(self):
        """Rank systems when multiple performances are used

        The default is numpy.linalg.norm.
        """
        return getattr(self, '_sorting_func', np.linalg.norm)

    @sorting_func.setter
    def sorting_func(self, value):
        self._sorting_func = value

    def __repr__(self):
        """p-value"""
        return f"<{self.__class__.__name__}>\n{self}"

    def __str__(self):
        """p-value"""
        best = self.best
        header = "difference p-values"
        if isinstance(best, str):
            header = f"{header} w.r.t {best}"
        output = [header]
        if isinstance(best, np.ndarray):
            desc = ', '.join(best)
            output.append(f'{desc} <= Best')
        for key, value in self.p_value().items():
            if np.ndim(value) == 0:
                output.append(f'{value:0.4f} <= {key}')
            else:
                desc = ', '.join(f'{v:0.4f}' for v in value)
                output.append(f'{desc} <= {key}')
        return "\n".join(output)

    def _delta_best(self):
        """Statistic of the reference; one value per performance when :py:attr:`best` is an array"""
        best = self.best
        if isinstance(best, str):
            return self.statistic[best]
        # The i-th performance is taken from the i-th reference system.
        return np.array([self.statistic[name][col]
                         for col, name in enumerate(best)])

    def p_value(self, right:bool=True):
        """Compute p_value of the differences

        :param right: Estimate the p-value using :math:`\\text{sample} \\geq 2\\delta`; otherwise, the proportion of samples lower than or equal to zero is used.
        :type right: bool
        :return: p-value of each system sorted with :py:attr:`sorting_func`; it is empty when there are no differences.
        :rtype: dict

        >>> from sklearn.svm import LinearSVC
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from sklearn.datasets import load_iris
        >>> from sklearn.model_selection import train_test_split
        >>> from sklearn.base import clone
        >>> from CompStats.interface import Perf
        >>> X, y = load_iris(return_X_y=True)
        >>> _ = train_test_split(X, y, test_size=0.3)
        >>> X_train, X_val, y_train, y_val = _
        >>> m = LinearSVC().fit(X_train, y_train)
        >>> hy = m.predict(X_val)
        >>> ens = RandomForestClassifier().fit(X_train, y_train)
        >>> perf = Perf(y_val, hy, forest=ens.predict(X_val))
        >>> diff = perf.difference()
        >>> diff.p_value()
        {'forest': np.float64(0.3)}
        """
        calls = self.statistic_samples.calls
        if not calls:
            # Nothing to compare, e.g., only one system was given.
            return {}
        sign = 1 if self.statistic_samples.BiB else -1
        delta_best = self._delta_best()
        # One p-value per performance when the reference has several.
        axis = 0 if isinstance(delta_best, np.ndarray) else None
        values = []
        for k, v in calls.items():
            if right:
                delta = 2 * sign * (delta_best - self.statistic[k])
                p = (v >= delta).mean(axis=axis)
            else:
                p = (v <= 0).mean(axis=axis)
            values.append((k, p))
        values.sort(key=lambda x: self.sorting_func(x[1]))
        return dict(values)

    def dataframe(self, value_name:str='Score',
                  var_name:str='Best',
                  alg_legend:str='Algorithm',
                  sig_legend:str='Significant',
                  perf_names:list=None,
                  right:bool=True,
                  alpha:float=0.05):
        """Dataframe

        Column ``sig_legend`` is True for the systems whose p-value is
        lower than ``alpha``.
        """
        best = self.best
        if perf_names is None and isinstance(best, np.ndarray):
            perf_names = [f'{alg}({k})' for k, alg in enumerate(best)]
        df = dataframe(self, value_name=value_name,
                       var_name=var_name,
                       alg_legend=alg_legend,
                       perf_names=perf_names)
        df[sig_legend] = False
        p_values = self.p_value(right=right)
        if isinstance(best, str):
            for name, p in p_values.items():
                if p >= alpha:
                    continue
                df.loc[df[alg_legend] == name, sig_legend] = True
        else:
            systems = list(p_values.keys())
            p_values = np.array([p_values[k] for k in systems])
            for name, p_value in zip(perf_names, p_values.T):
                mask = df[var_name] == name
                for alg, p in zip(systems, p_value):
                    if p >= alpha:
                        continue
                    _ = mask & (df[alg_legend] == alg)
                    df.loc[_, sig_legend] = True
        return df

    def plot(self, value_name:str='Difference',
             var_name:str='Best',
             alg_legend:str='Algorithm',
             sig_legend:str='Significant',
             perf_names:list=None,
             alpha:float=0.05,
             right:bool=True,
             kind:str='point', linestyle:str='none',
             col_wrap:int=3, capsize:float=0.2,
             set_refline:bool=True,
             **kwargs):
        """Plot

        The remaining keyword arguments are given to seaborn.catplot.

        >>> from sklearn.svm import LinearSVC
        >>> from sklearn.ensemble import RandomForestClassifier
        >>> from sklearn.datasets import load_iris
        >>> from sklearn.model_selection import train_test_split
        >>> from sklearn.base import clone
        >>> from CompStats.interface import Perf
        >>> X, y = load_iris(return_X_y=True)
        >>> _ = train_test_split(X, y, test_size=0.3)
        >>> X_train, X_val, y_train, y_val = _
        >>> m = LinearSVC().fit(X_train, y_train)
        >>> hy = m.predict(X_val)
        >>> ens = RandomForestClassifier().fit(X_train, y_train)
        >>> perf = Perf(y_val, hy, forest=ens.predict(X_val))
        >>> diff = perf.difference()
        >>> diff.plot()
        """
        import seaborn as sns
        df = self.dataframe(value_name=value_name,
                            var_name=var_name,
                            alg_legend=alg_legend,
                            sig_legend=sig_legend,
                            perf_names=perf_names,
                            alpha=alpha, right=right)
        # Kept for the title: var_name is reset when it is not a column.
        title = var_name
        if var_name not in df.columns:
            var_name = None
            col_wrap = None

        def ci(x):
            return measurements.CI(x, alpha=2*alpha)

        f_grid = sns.catplot(df, x=value_name, errorbar=ci,
                             y=alg_legend, col=var_name,
                             kind=kind, linestyle=linestyle,
                             col_wrap=col_wrap, capsize=capsize,
                             hue=sig_legend,
                             **kwargs)
        if set_refline:
            f_grid.refline(x=0)
        if isinstance(self.best, str):
            f_grid.facet_axis(0, 0).set_title(f'{title} = {self.best}')
        return f_grid