Source code for mlresearch.active_learning._active_learning

from typing import Union
from copy import deepcopy
import numpy as np
from sklearn.base import ClassifierMixin, BaseEstimator, clone
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling.base import BaseOverSampler

from .base import BaseActiveLearner


def _random_initialization(self, X=None, y=None, initial_selection=None):
    """Randomly select an initial training dataset."""
    if initial_selection is not None:
        # A precomputed boolean mask was passed; use it as-is.
        labeled_pool = initial_selection
    else:
        rng = np.random.RandomState(self.random_state)
        labeled_pool = np.zeros(X.shape[0], dtype=bool)
        ids = rng.choice(np.arange(X.shape[0]), self.n_init_, replace=False)
        # If every sampled observation belongs to the same class, swap the
        # last one for an observation of a different class, so that the
        # initial training set contains at least two classes.
        if np.unique(y[ids]).shape[0] == 1:
            ids[-1] = rng.choice(
                np.arange(X.shape[0])[y != y[ids][0]], 1, replace=False
            ).squeeze()
        labeled_pool[ids] = True
    return labeled_pool
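
# A minimal sketch of the initializer's behavior (illustrative only; the
# ``_Seed`` stub below is hypothetical and stands in for an Active Learning
# model exposing ``n_init_`` and ``random_state``):
#
#     class _Seed:
#         n_init_ = 5
#         random_state = 0
#
#     X = np.random.rand(100, 3)
#     y = np.array([0] * 95 + [1] * 5)      # heavily imbalanced labels
#     mask = _random_initialization(_Seed(), X, y)
#     mask.sum()                            # 5: exactly ``n_init_`` seeds
#     np.unique(y[mask]).shape[0]           # >= 2: both classes represented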


class StandardAL(BaseActiveLearner, ClassifierMixin):
    """
    Standard Active Learning model with a random initial data selection.

    Parameters
    ----------
    classifier : classifier object, default=None
        Classifier or pipeline to be trained in the iterative process. If
        None, defaults to sklearn's ``RandomForestClassifier`` with default
        parameters and uses the ``random_state`` passed in the Active
        Learning model.

    acquisition_func : function or {'entropy', 'breaking_ties', 'random'}, default=None
        Method used to quantify the prediction's uncertainty level. All
        predefined functions are set up so that a higher value means higher
        uncertainty (higher likelihood of selection) and vice versa. The
        uncertainty estimate is used to select the instances to be added to
        the labeled/training dataset. Acquisition functions may be added or
        changed in the ``UNCERTAINTY_FUNCTIONS`` dictionary. If None,
        defaults to "random".

    n_init : int or float, default=None
        Number of observations to include in the initial training dataset.
        If ``n_init`` < 1, then the corresponding percentage of the original
        dataset will be used as the initial training set. If None, defaults
        to 2% of the size of the original dataset.

    budget : int or float, default=None
        Number of observations to be added to the training dataset at each
        iteration. If ``budget`` < 1, then the corresponding percentage of
        the original dataset will be added at each iteration. If None,
        defaults to 2% of the size of the original dataset.

    max_iter : int, default=None
        Maximum number of iterations allowed. If None, the experiment will
        run until 100% of the dataset is added to the training set.

    evaluation_metric : string, default='accuracy'
        Metric used to calculate the test scores. See ``mlresearch.metrics``
        for info on available performance metrics.

    continue_training : bool, default=False
        If ``False``, fit a new classifier at each iteration. If ``True``,
        the classifier fitted in the previous iteration is used for further
        training in subsequent iterations.

    random_state : int, RandomState instance, default=None
        Control the randomization of the algorithm.

        - If int, ``random_state`` is the seed used by the random number
          generator;
        - If ``RandomState`` instance, ``random_state`` is the random number
          generator;
        - If ``None``, the random number generator is the ``RandomState``
          instance used by ``np.random``.

    Attributes
    ----------
    acquisition_func_ : function
        Method used to calculate the classification uncertainty at each
        iteration.

    evaluation_metric_ : scorer
        Metric used to estimate the performance of the AL classifier at each
        iteration.

    classifier_ : estimator object
        The classifier used in the iterative process. It is the classifier
        fitted in the last iteration.

    metadata_ : dict
        Contains the performance estimations, classifiers, labeled pool mask
        and original dataset.

    n_init_ : int
        Number of observations included in the initial training dataset.

    budget_ : int
        Number of observations added to the training set per iteration.

    max_iter_ : int
        Maximum number of iterations allowed.

    labeled_pool_ : array-like of shape (n_samples,)
        Mask that filters the labeled observations from the original dataset.
    """

    def _initialization(self, X=None, y=None, initial_selection=None):
        labeled_pool = _random_initialization(self, X, y, initial_selection)
        return labeled_pool

    def _iteration(self, X, y, **kwargs):
        # Fit on the labeled pool and return class probabilities for the
        # remaining unlabeled observations.
        self.classifier_.fit(X[self.labeled_pool_], y[self.labeled_pool_])
        return self.classifier_.predict_proba(X[~self.labeled_pool_])

    def _oracle(self, probabilities):
        # Label the ``budget_`` unlabeled observations with the highest
        # classification uncertainty.
        uncertainties = self.acquisition_func_(probabilities)
        unlabeled_ids = np.argwhere(~self.labeled_pool_).squeeze()
        ids = (
            unlabeled_ids[np.argsort(uncertainties)[::-1][: self.budget_]]
            if unlabeled_ids.ndim >= 1
            else unlabeled_ids.flatten()[0]
        )
        self.labeled_pool_[ids] = True
        return self

class AugmentationAL(BaseActiveLearner, ClassifierMixin):
    """
    Active Learning with pipelined Data Augmentation. This method is
    implemented and analysed in a working paper.

    Parameters
    ----------
    classifier : classifier object, default=None
        Classifier or pipeline to be trained in the iterative process. If
        None, defaults to sklearn's ``RandomForestClassifier`` with default
        parameters and uses the ``random_state`` passed in the Active
        Learning model.

    generator : generator estimator, default=None
        Generator to be used for artificial data generation within Active
        Learning iterations.

    param_grid : dict, default=None
        Used to optimize the classifier and generator hyperparameters at
        each iteration via cross-validated grid-search. Dictionary with
        parameter names (``str``) as keys and lists of parameter settings to
        try as values. This enables searching over any sequence of parameter
        settings. If None, parameter tuning is skipped.

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy. Used to optimize
        the classifier and generator hyperparameters at each iteration.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - `CV splitter`.

        For integer/None inputs, if the estimator is a classifier and ``y``
        is either binary or multiclass, :class:`StratifiedKFold` is used. In
        all other cases, :class:`KFold` is used. These splitters are
        instantiated with `shuffle=False` so the splits will be the same
        across calls.

    acquisition_func : function or {'entropy', 'breaking_ties', 'random'}, default=None
        Method used to quantify the prediction's uncertainty level. All
        predefined functions are set up so that a higher value means higher
        uncertainty (higher likelihood of selection) and vice versa. The
        uncertainty estimate is used to select the instances to be added to
        the labeled/training dataset. Acquisition functions may be added or
        changed in the ``UNCERTAINTY_FUNCTIONS`` dictionary. If None,
        defaults to "random".

    n_init : int or float, default=None
        Number of observations to include in the initial training dataset.
        If ``n_init`` < 1, then the corresponding percentage of the original
        dataset will be used as the initial training set. If None, defaults
        to 2% of the size of the original dataset.

    budget : int or float, default=None
        Number of observations to be added to the training dataset at each
        iteration. If ``budget`` < 1, then the corresponding percentage of
        the original dataset will be added at each iteration. If None,
        defaults to 2% of the size of the original dataset.

    max_iter : int, default=None
        Maximum number of iterations allowed. If None, the experiment will
        run until 100% of the dataset is added to the training set.

    evaluation_metric : string, default='accuracy'
        Metric used to calculate the test scores. See ``mlresearch.metrics``
        for info on available performance metrics.

    continue_training : bool, default=False
        If ``False``, fit a new classifier at each iteration. If ``True``,
        the classifier fitted in the previous iteration is used for further
        training in subsequent iterations.

    random_state : int, RandomState instance, default=None
        Control the randomization of the algorithm.

        - If int, ``random_state`` is the seed used by the random number
          generator;
        - If ``RandomState`` instance, ``random_state`` is the random number
          generator;
        - If ``None``, the random number generator is the ``RandomState``
          instance used by ``np.random``.

    Attributes
    ----------
    acquisition_func_ : function
        Method used to calculate the classification uncertainty at each
        iteration.

    evaluation_metric_ : scorer
        Metric used to estimate the performance of the AL classifier at each
        iteration.

    classifier_ : estimator object
        The classifier used in the iterative process. It is the classifier
        fitted in the last iteration.

    metadata_ : dict
        Contains the performance estimations, classifiers, labeled pool mask
        and original dataset.

    n_init_ : int
        Number of observations included in the initial training dataset.

    budget_ : int
        Number of observations added to the training set per iteration.

    max_iter_ : int
        Maximum number of iterations allowed.

    labeled_pool_ : array-like of shape (n_samples,)
        Mask that filters the labeled observations from the original dataset.
    """

    def __init__(
        self,
        classifier: Union[BaseEstimator, ClassifierMixin] = None,
        generator: BaseOverSampler = None,
        param_grid: dict = None,
        cv=None,
        acquisition_func=None,
        n_init: Union[int, float] = None,
        budget: Union[int, float] = None,
        max_iter: int = None,
        evaluation_metric=None,
        continue_training: bool = False,
        random_state: int = None,
    ):
        super().__init__(
            classifier=classifier,
            acquisition_func=acquisition_func,
            n_init=n_init,
            budget=budget,
            max_iter=max_iter,
            evaluation_metric=evaluation_metric,
            continue_training=continue_training,
            random_state=random_state,
        )
        self.generator = generator
        self.param_grid = param_grid
        self.cv = cv

    def _check(self, X, y):
        super()._check(X, y)

        # Generator
        if (
            self.generator is not None
            and hasattr(self.generator, "random_state")
            and self.generator.random_state is None
            and self.random_state is not None
        ):
            # Check random state
            self._generator = clone(self.generator)
            self._generator.set_params(random_state=self.random_state)

            # Add generator to classifier as a pipeline
            generator = clone(self._generator)
            classifier = clone(self._classifier)
            self._classifier = Pipeline(
                [("generator", generator), ("classifier", classifier)]
            )

        # Check if parameters in param_grid are valid
        if type(self.param_grid) is dict:
            for key in self.param_grid.keys():
                if key not in self._classifier.get_params():
                    raise ValueError(
                        f"Invalid parameter {key} for generator or classifier "
                        f"in {self}. Check the list of available parameters "
                        "with `almodel._classifier.get_params().keys()`."
                    )
        elif self.param_grid is not None:
            raise TypeError(
                "``param_grid`` must be a dict or None. "
                f"Got {self.param_grid} instead."
            )

    def _save_metadata(self, X, y, **kwargs):
        super()._save_metadata(X, y, **kwargs)
        if hasattr(self, "classifier_") and type(self.classifier_) is GridSearchCV:
            # Store the hyperparameter values selected at this iteration.
            self.metadata_[self._current_iter]["parameters"] = {
                k: v
                for k, v in self.classifier_.best_estimator_.get_params().items()
                if k in self.param_grid.keys()
            }

    def _check_cross_validation(self, y):
        """Define cross-validation object"""
        # Cap the number of folds at the frequency of the rarest class, so
        # that every fold can contain at least one observation of each class.
        min_frequency = np.unique(y, return_counts=True)[-1].min()
        cv = deepcopy(self.cv)
        if hasattr(self.cv, "n_splits"):
            cv.n_splits = min(min_frequency, cv.n_splits)
        elif type(self.cv) is int:
            cv = min(min_frequency, cv)
        elif cv is None:
            cv = min(min_frequency, 5)
        else:
            raise TypeError(
                "``cv`` object must be of type int or cross-validation "
                f"generator. Got {self.cv} instead."
            )
        return cv

    def _initialization(self, X=None, y=None, initial_selection=None):
        labeled_pool = _random_initialization(self, X, y, initial_selection)
        return labeled_pool

    def _iteration(self, X, y, **kwargs):
        # Set up parameter tuning within iterations; tuning is skipped when
        # the rarest class has a single observation (i.e., cv == 1).
        cv = self._check_cross_validation(y[self.labeled_pool_])
        if self.param_grid is not None and cv != 1:
            self.classifier_ = GridSearchCV(
                estimator=self.classifier_,
                param_grid=self.param_grid,
                scoring=self.evaluation_metric_,
                cv=cv,
                refit=True,
            )
        self.classifier_.fit(X[self.labeled_pool_], y[self.labeled_pool_])
        return self.classifier_.predict_proba(X[~self.labeled_pool_])

    def _oracle(self, probabilities):
        # Label the ``budget_`` unlabeled observations with the highest
        # classification uncertainty.
        uncertainties = self.acquisition_func_(probabilities)
        unlabeled_ids = np.argwhere(~self.labeled_pool_).squeeze()
        ids = (
            unlabeled_ids[np.argsort(uncertainties)[::-1][: self.budget_]]
            if unlabeled_ids.ndim >= 1
            else unlabeled_ids.flatten()[0]
        )
        self.labeled_pool_[ids] = True
        return self