# Source code for CompStats.bootstrap

# Copyright 2024 Sergio Nava Muñoz and Mario Graff Guerrero

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Callable
from joblib import delayed, Parallel
from copy import copy
import numpy as np



class StatisticSamples:
    """Apply the statistic to `num_samples` samples taken with replacement
    from the population (arguments).

    :param statistic: Statistic.
    :type statistic: Callable
    :param num_samples: Number of bootstrap samples, default=500.
    :type num_samples: int
    :param n_jobs: Number of jobs to run in parallel, default=1.
    :type n_jobs: int
    :param BiB: Flag kept on the instance and reported by
        :py:func:`get_params`; this class never reads it (presumably
        "bigger is better" -- confirm with the callers), default=True.
    :type BiB: bool

    The bootstrap indices come from NumPy's global random state, so the
    values shown below change from run to run unless the seed is fixed.

    >>> from CompStats import StatisticSamples
    >>> from sklearn.metrics import accuracy_score
    >>> import numpy as np
    >>> statistic = StatisticSamples(num_samples=10, statistic=np.mean)
    >>> empirical_distribution = np.r_[[3, 4, 5, 2, 4]]
    >>> statistic(empirical_distribution)
    array([2.8, 3.6, 3.6, 3.6, 2.6, 4. , 2.8, 3. , 3.8, 3.6])
    >>> labels = np.r_[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]
    >>> pred   = np.r_[[0, 0, 1, 0, 0, 1, 1, 1, 0, 1]]
    >>> acc = StatisticSamples(num_samples=15, statistic=accuracy_score)
    >>> acc(labels, pred)
    array([0.9, 0.8, 0.7, 1. , 0.6, 1. , 0.7, 0.9, 0.9, 0.8, 0.9, 0.8, 0.8, 0.8, 0.8])
    """

    def __init__(self,
                 statistic: Callable[[np.ndarray], float]=np.mean,
                 num_samples: int=500,
                 n_jobs: int=1,
                 BiB: bool=True):
        self.statistic = statistic
        self.num_samples = num_samples
        self.n_jobs = n_jobs
        self.BiB = BiB  # Keep the BiB flag so get_params can report it
        # Cached matrix of bootstrap indices; built lazily by `samples`.
        self._samples = None
        self._calls = {}
        self._info = {}

    @property
    def info(self):
        """Information about the samples"""
        return self._info

    @info.setter
    def info(self, value):
        self._info = value

    def get_params(self):
        """Constructor arguments of this instance.

        :return: Mapping with the keys `statistic`, `num_samples`,
            `n_jobs` and `BiB`.
        :rtype: dict
        """
        return dict(statistic=self.statistic,
                    num_samples=self.num_samples,
                    n_jobs=self.n_jobs,
                    BiB=self.BiB)  # BiB is part of the parameters

    def __sklearn_clone__(self):
        """Build a new instance from :py:func:`get_params`.

        Only `info` is carried over (as a shallow copy); the stored calls
        and the cached bootstrap indices are not copied.
        """
        klass = self.__class__
        params = self.get_params()
        ins = klass(**params)
        ins.info = copy(self.info)
        return ins

    @property
    def calls(self):
        """Dictionary containing the output of the calls when a name is given"""
        return self._calls

    @calls.setter
    def calls(self, value):
        self._calls = value

    @property
    def n_jobs(self):
        """Number of jobs to do in parallel"""
        return self._n_jobs

    @n_jobs.setter
    def n_jobs(self, value):
        self._n_jobs = value

    @property
    def statistic(self):
        """Statistic function."""
        return self._statistic

    @statistic.setter
    def statistic(self, value):
        self._statistic = value

    @property
    def num_samples(self):
        """Number of bootstrap samples."""
        return self._num_samples

    @num_samples.setter
    def num_samples(self, value):
        self._num_samples = value

    @property
    def statistic_samples(self):
        """It contains the statistic samples of the latest call."""
        assert hasattr(self, '_statistic_samples'), \
            'statistic_samples is only available after the instance is called'
        return self._statistic_samples

    @statistic_samples.setter
    def statistic_samples(self, value):
        self._statistic_samples = value

    def samples(self, N):
        """Bootstrap indices, one row per bootstrap sample.

        The matrix is cached, so consecutive calls with the same
        population size (and the same `num_samples`) reuse the same
        indices. It is drawn again when either of the two changes.

        :param N: Population size.
        :type N: int
        :return: Integer array of shape `(num_samples, N)` with values
            in `[0, N)`.
        :rtype: np.ndarray
        """
        shape = (self.num_samples, N)
        # getattr keeps instances created without __init__ working.
        cached = getattr(self, '_samples', None)
        # Compare the whole shape: checking only the number of columns
        # would hand back a stale matrix after `num_samples` is modified.
        if cached is None or cached.shape != shape:
            cached = np.random.randint(N, size=shape)
            self._samples = cached
        return cached

    def keys(self):
        """calls keys"""
        return self.calls.keys()

    def __getitem__(self, key):
        """Output stored in `calls` under `key`."""
        return self.calls[key]

    def __call__(self, *args: np.ndarray, name=None) -> np.ndarray:
        """Population where the bootstrap process will be performed.

        Every array in `args` is indexed with the same bootstrap indices,
        which are drawn for the first-axis length of the first argument.

        :param *args: Population
        :type *args: np.ndarray
        :param name: When given, the output is also stored in `calls`
            under this key.
        :return: The statistic computed on each bootstrap sample.
        :rtype: np.ndarray
        """
        def inner(index):
            resampled = [arg[index] for arg in args]
            return self.statistic(*resampled)

        N = args[0].shape[0]
        B = Parallel(n_jobs=self.n_jobs)(delayed(inner)(index)
                                         for index in self.samples(N))
        self.statistic_samples = np.array(B)
        if name is not None:
            self.calls[name] = self.statistic_samples
        return self.statistic_samples

    def melt(self, var_name='Algorithm', value_name='Score'):
        """Represent into a long DataFrame

        :param var_name: Name of the column holding the keys of `calls`.
        :type var_name: str
        :param value_name: Name of the column holding the values.
        :type value_name: str
        :return: `calls` in long format.
        :rtype: pandas.DataFrame
        """
        # Imported here so pandas is only required when melt is used.
        import pandas as pd

        return pd.DataFrame(self.calls).melt(var_name=var_name,
                                             value_name=value_name)





# class CI(StatisticSamples):
#     """Compute the Confidence Interval of a statistic using bootstrap.
    
#     :param alpha: :math:`[\\frac{\\alpha}{2}, 1 - \\frac{\\alpha}{2}]`. 
#     :type alpha: float

#     >>> from IngeoML import CI
#     >>> from sklearn.metrics import accuracy_score
#     >>> import numpy as np    
#     >>> labels = np.r_[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]
#     >>> pred   = np.r_[[0, 0, 1, 0, 0, 1, 1, 1, 0, 1]]
#     >>> acc = CI(statistic=accuracy_score)
#     >>> acc(labels, pred)
#     (0.7, 1.0)
#     """
#     def __init__(self, alpha: float=0.05,
#                  **kwargs):
#         super().__init__(**kwargs)
#         self.alpha = alpha

#     @property
#     def alpha(self):
#         """The interval is computed for :math:`[\\frac{\\alpha}{2}, 1 - \\frac{\\alpha}{2}]`.
#         """
#         return self._alpha
    
#     @alpha.setter
#     def alpha(self, value):
#         self._alpha = value / 2

#     def __call__(self, *args: np.ndarray) -> np.ndarray:
#         B =  super().__call__(*args)
#         alpha  = self.alpha  
#         return (np.percentile(B, alpha * 100, axis=0), 
#                 np.percentile(B, (1 - alpha) * 100, axis=0))
    

# class SE(StatisticSamples):
#     """Compute the Standard Error of a statistic using bootstrap.

#     >>> from IngeoML import SE
#     >>> from sklearn.metrics import accuracy_score
#     >>> import numpy as np    
#     >>> labels = np.r_[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]
#     >>> pred   = np.r_[[0, 0, 1, 0, 0, 1, 1, 1, 0, 1]]
#     >>> se = SE(statistic=accuracy_score)
#     >>> se(labels, pred)
#     0.11949493713124419
#     """

#     def __call__(self, *args: np.ndarray) -> float:
#         B =  super().__call__(*args)
#         return np.std(B, axis=0)


# class Difference(CI):
#     def __init__(self, y: np.ndarray, 
#                  algorithms: dict={}, 
#                  performance: Callable[[np.ndarray, np.ndarray], float]=lambda y, hy: f1_score(y, hy, average='macro'),
#                  **kwargs) -> None:
#         super(Difference, self).__init__(populations=algorithms, statistic=performance)
#         self.y = y
#         self._dist = dict()
#         self._delta = dict()
#         self._pvalue_r = dict()
#         self._pvalue_l = dict()

#     @property
#     def y(self):
#         return self._y
    
#     @y.setter
#     def y(self, value):
#         self._y = value

#     @property
#     def best(self):
#         try:
#             return self._best
#         except AttributeError:
#             y = self.y
#             best = (None, -np.inf)
#             for k, v in self.populations.items():
#                 perf = self.statistic(y, v)
#                 if perf > best[1]:
#                     best = (k, perf)
#             self._best = best[0]
#             return self._best

#     def delta(self, key):
#         assert key != self.best
#         if key in self._delta:
#             return self._delta[key]
#         y = self.y
#         algs = self.populations
#         perf = self.statistic
#         delta = perf(y, algs[self.best]) - perf(y, algs[key])
#         self._delta[key] = delta
#         return delta
    
#     def samples(self, key):
#         if key in self.statistic_samples:
#             return self.statistic_samples[key]
#         data = self.populations[key]
#         y = self.y
#         output = np.array([self.statistic(y[s], data[s])
#                            for s in self.bootstrap])
#         self.statistic_samples[key] = output
#         return output    
    
#     @property
#     def best_performance(self):
#         return self.samples(self.best)
        
#     def distribution(self, key):
#         best = self.best
#         assert key != best
#         if key in self._dist:
#             return self._dist[key]
#         output = self.best_performance - self.samples(key)
#         self._dist[key] = output
#         return output

#     def pvalue(self, key, side='right'):
#         assert side in ['left', 'right']
#         assert key != self.best
#         if side == 'right':
#             if key in self._pvalue_r:
#                 return self._pvalue_r[key]
#         elif key in self._pvalue_l:
#             return self._pvalue_l[key]
#         c = 0
#         delta_2 = 2 * self.delta(key)
#         delta_i = self.distribution(key)
#         if side == 'right':
#             c = (delta_i >= delta_2).mean()
#         else:
#             c = (delta_i < 0).mean()
#         if side == 'right':
#             self._pvalue_r[key] = c
#         else:
#             self._pvalue_l[key] = c
#         return c
    
#     def sort(self, side='right'):
#         best = self.best
#         algs = [(k, self.pvalue(k, side=side))
#                 for k in self.populations if k != best]
#         algs.sort(key=lambda x: x[1], reverse=True)
#         return [k for k, _ in algs]