Source code for CompStats.bootstrap

# Copyright 2024 Sergio Nava Muñoz and Mario Graff Guerrero

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Callable
from joblib import delayed, Parallel
from copy import copy
import numpy as np


[docs] class StatisticSamples: """Apply the statistic to `num_samples` samples taken with replacement from the population (arguments). :param statistic: Statistic. :type statistic: Callable :param num_samples: Number of bootstrap samples, default=500. :type num_samples: int :param n_jobs: Number of jobs to run in parallel, default=1. :type n_jobs: int >>> from CompStats import StatisticSamples >>> from sklearn.metrics import accuracy_score >>> import numpy as np >>> statistic = StatisticSamples(num_samples=10, statistic=np.mean) >>> empirical_distribution = np.r_[[3, 4, 5, 2, 4]] >>> statistic(empirical_distribution) array([2.8, 3.6, 3.6, 3.6, 2.6, 4. , 2.8, 3. , 3.8, 3.6]) >>> labels = np.r_[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]] >>> pred = np.r_[[0, 0, 1, 0, 0, 1, 1, 1, 0, 1]] >>> acc = StatisticSamples(num_samples=15, statistic=accuracy_score) >>> acc(labels, pred) array([0.9, 0.8, 0.7, 1. , 0.6, 1. , 0.7, 0.9, 0.9, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8]) """
[docs] def __init__(self, statistic: Callable[[np.ndarray], float]=np.mean, num_samples: int=500, n_jobs: int=1, BiB: bool=True): self.statistic = statistic self.num_samples = num_samples self.n_jobs = n_jobs self.BiB = BiB # Guardar el parámetro BiB self._samples = None self._calls = {} self._info = {}
@property def info(self): """Information about the samples""" return self._info @info.setter def info(self, value): self._info = value
[docs] def get_params(self): """Parameters""" return dict(statistic=self.statistic, num_samples=self.num_samples, n_jobs=self.n_jobs, BiB=self.BiB) # Añadir BiB a los parámetros
def __sklearn_clone__(self): klass = self.__class__ params = self.get_params() ins = klass(**params) ins.info = copy(self.info) return ins @property def calls(self): """Dictionary containing the output of the calls when a name is given""" return self._calls @calls.setter def calls(self, value): self._calls = value @property def n_jobs(self): """Number of jobs to do in parallel""" return self._n_jobs @n_jobs.setter def n_jobs(self, value): self._n_jobs = value @property def statistic(self): """Statistic function.""" return self._statistic @statistic.setter def statistic(self, value): self._statistic = value @property def num_samples(self): """Number of bootstrap samples.""" return self._num_samples @num_samples.setter def num_samples(self, value): self._num_samples = value @property def statistic_samples(self): """It contains the statistic samples of the latest call.""" assert hasattr(self, '_statistic_samples') return self._statistic_samples @statistic_samples.setter def statistic_samples(self, value): self._statistic_samples = value
[docs] def samples(self, N): """Samples. :param N: Population size. :type N: int """ def inner(N): _ = np.random.randint(N, size=(self.num_samples, N)) self._samples = _ return self._samples try: if self._samples.shape[1] == N: return self._samples else: return inner(N) except AttributeError: return inner(N)
[docs] def keys(self): """calls keys""" return self.calls.keys()
def __getitem__(self, key): return self.calls[key] def __call__(self, *args: np.ndarray, name=None) -> np.ndarray: """Population where the bootstrap process will be performed. :param *args: Population :type *args: np.ndarray """ def inner(s): _ = [arg[s] for arg in args] return self.statistic(*_) N = args[0].shape[0] B = Parallel(n_jobs=self.n_jobs)(delayed(inner)(s) for s in self.samples(N)) self.statistic_samples = np.array(B) if name is not None: self.calls[name] = self.statistic_samples return self.statistic_samples
[docs] def melt(self, var_name='Algorithm', value_name='Score'): """Represent into a long DataFrame""" import pandas as pd return pd.DataFrame(self.calls).melt(var_name=var_name, value_name=value_name)
# class CI(StatisticSamples): # """Compute the Confidence Interval of a statistic using bootstrap. # :param alpha: :math:`[\\frac{\\alpha}{2}, 1 - \\frac{\\alpha}{2}]`. # :type alpha: float # >>> from IngeoML import CI # >>> from sklearn.metrics import accuracy_score # >>> import numpy as np # >>> labels = np.r_[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]] # >>> pred = np.r_[[0, 0, 1, 0, 0, 1, 1, 1, 0, 1]] # >>> acc = CI(statistic=accuracy_score) # >>> acc(labels, pred) # (0.7, 1.0) # """ # def __init__(self, alpha: float=0.05, # **kwargs): # super().__init__(**kwargs) # self.alpha = alpha # @property # def alpha(self): # """The interval is computed for :math:`[\\frac{\\alpha}{2}, 1 - \\frac{\\alpha}{2}]`. # """ # return self._alpha # @alpha.setter # def alpha(self, value): # self._alpha = value / 2 # def __call__(self, *args: np.ndarray) -> np.ndarray: # B = super().__call__(*args) # alpha = self.alpha # return (np.percentile(B, alpha * 100, axis=0), # np.percentile(B, (1 - alpha) * 100, axis=0)) # class SE(StatisticSamples): # """Compute the Standard Error of a statistic using bootstrap. # >>> from IngeoML import SE # >>> from sklearn.metrics import accuracy_score # >>> import numpy as np # >>> labels = np.r_[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]] # >>> pred = np.r_[[0, 0, 1, 0, 0, 1, 1, 1, 0, 1]] # >>> se = SE(statistic=accuracy_score) # >>> se(labels, pred) # 0.11949493713124419 # """ # def __call__(self, *args: np.ndarray) -> float: # B = super().__call__(*args) # return np.std(B, axis=0) # class Difference(CI): # def __init__(self, y: np.ndarray, # algorithms: dict={}, # performance: Callable[[np.ndarray, np.ndarray], float]=lambda y, hy: f1_score(y, hy, average='macro'), # **kwargs) -> None: # super(Difference, self).__init__(populations=algorithms, statistic=performance) # self.y = y # self._dist = dict() # self._delta = dict() # self._pvalue_r = dict() # self._pvalue_l = dict() # @property # def y(self): # return self._y # @y.setter # def y(self, value): # self._y = value # @property # def best(self): # try: # return self._best # except AttributeError: # y = self.y # best = (None, -np.inf) # for k, v in self.populations.items(): # perf = self.statistic(y, v) # if perf > best[1]: # best = (k, perf) # self._best = best[0] # return self._best # def delta(self, key): # assert key != self.best # if key in self._delta: # return self._delta[key] # y = self.y # algs = self.populations # perf = self.statistic # delta = perf(y, algs[self.best]) - perf(y, algs[key]) # self._delta[key] = delta # return delta # def samples(self, key): # if key in self.statistic_samples: # return self.statistic_samples[key] # data = self.populations[key] # y = self.y # output = np.array([self.statistic(y[s], data[s]) # for s in self.bootstrap]) # self.statistic_samples[key] = output # return output # @property # def best_performance(self): # return self.samples(self.best) # def distribution(self, key): # best = self.best # assert key != best # if key in self._dist: # return self._dist[key] # output = self.best_performance - self.samples(key) # self._dist[key] = output # return output # def pvalue(self, key, side='right'): # assert side in ['left', 'right'] # assert key != self.best # if side == 'right': # if key in self._pvalue_r: # return self._pvalue_r[key] # elif key in self._pvalue_l: # return self._pvalue_l[key] # c = 0 # delta_2 = 2 * self.delta(key) # delta_i = self.distribution(key) # if side == 'right': # c = (delta_i >= delta_2).mean() # else: # c = (delta_i < 0).mean() # if side == 'right': # self._pvalue_r[key] = c # else: # self._pvalue_l[key] = c # return c # def sort(self, side='right'): # best = self.best # algs = [(k, self.pvalue(k, side=side)) # for k in self.populations if k != best] # algs.sort(key=lambda x: x[1], reverse=True) # return [k for k, _ in algs]