Source code for mlresearch.active_learning._active_learning

from typing import Union
from copy import deepcopy
import numpy as np
from sklearn.base import ClassifierMixin, BaseEstimator, clone
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling.base import BaseOverSampler

from .base import BaseActiveLearner


def _random_initialization(self, X=None, y=None, initial_selection=None):
    """Randomly select an initial training dataset."""
    if initial_selection is not None:
        # A precomputed boolean mask was passed; use it as-is.
        labeled_pool = initial_selection
    else:
        rng = np.random.RandomState(self.random_state)
        labeled_pool = np.zeros(X.shape[0], dtype=bool)
        ids = rng.choice(np.arange(X.shape[0]), self.n_init_, replace=False)
        # If every sampled observation belongs to the same class, swap the
        # last one for an observation of a different class, so that the
        # initial training set contains at least two classes.
        if np.unique(y[ids]).shape[0] == 1:
            ids[-1] = rng.choice(
                np.arange(X.shape[0])[y != y[ids][0]], 1, replace=False
            ).squeeze()
        labeled_pool[ids] = True
    return labeled_pool
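
# A minimal sketch of the initializer's behavior (illustrative only; the
# ``_Seed`` stub below is hypothetical and stands in for an Active Learning
# model exposing ``n_init_`` and ``random_state``):
#
#     class _Seed:
#         n_init_ = 5
#         random_state = 0
#
#     X = np.random.rand(100, 3)
#     y = np.array([0] * 95 + [1] * 5)      # heavily imbalanced labels
#     mask = _random_initialization(_Seed(), X, y)
#     mask.sum()                            # 5: exactly ``n_init_`` seeds
#     np.unique(y[mask]).shape[0]           # >= 2: both classes represented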


class StandardAL(BaseActiveLearner, ClassifierMixin):
    """
    Standard Active Learning model with a random initial data selection.

    Parameters
    ----------
    classifier : classifier object, default=None
        Classifier or pipeline to be trained in the iterative process. If
        None, defaults to sklearn's ``RandomForestClassifier`` with default
        parameters and uses the ``random_state`` passed in the Active
        Learning model.

    acquisition_func : function or {'entropy', 'breaking_ties', 'random'}, default=None
        Method used to quantify the prediction's uncertainty level. All
        predefined functions are set up so that a higher value means higher
        uncertainty (higher likelihood of selection) and vice versa. The
        uncertainty estimate is used to select the instances to be added to
        the labeled/training dataset. Acquisition functions may be added or
        changed in the ``UNCERTAINTY_FUNCTIONS`` dictionary. If None,
        defaults to "random".

    n_init : int or float, default=None
        Number of observations to include in the initial training dataset.
        If ``n_init`` < 1, then the corresponding percentage of the original
        dataset will be used as the initial training set. If None, defaults
        to 2% of the size of the original dataset.

    budget : int or float, default=None
        Number of observations to be added to the training dataset at each
        iteration. If ``budget`` < 1, then the corresponding percentage of
        the original dataset will be added at each iteration. If None,
        defaults to 2% of the size of the original dataset.

    max_iter : int, default=None
        Maximum number of iterations allowed. If None, the experiment will
        run until 100% of the dataset is added to the training set.

    evaluation_metric : string, default='accuracy'
        Metric used to calculate the test scores. See ``mlresearch.metrics``
        for info on available performance metrics.

    continue_training : bool, default=False
        If ``False``, fit a new classifier at each iteration. If ``True``,
        the classifier fitted in the previous iteration is used for further
        training in subsequent iterations.

    random_state : int, RandomState instance, default=None
        Control the randomization of the algorithm.

        - If int, ``random_state`` is the seed used by the random number
          generator;
        - If ``RandomState`` instance, ``random_state`` is the random number
          generator;
        - If ``None``, the random number generator is the ``RandomState``
          instance used by ``np.random``.

    Attributes
    ----------
    acquisition_func_ : function
        Method used to calculate the classification uncertainty at each
        iteration.

    evaluation_metric_ : scorer
        Metric used to estimate the performance of the AL classifier at each
        iteration.

    classifier_ : estimator object
        The classifier used in the iterative process. It is the classifier
        fitted in the last iteration.

    metadata_ : dict
        Contains the performance estimations, classifiers, labeled pool mask
        and original dataset.

    n_init_ : int
        Number of observations included in the initial training dataset.

    budget_ : int
        Number of observations added to the training set per iteration.

    max_iter_ : int
        Maximum number of iterations allowed.

    labeled_pool_ : array-like of shape (n_samples,)
        Mask that filters the labeled observations from the original dataset.
    """

    def _initialization(self, X=None, y=None, initial_selection=None):
        labeled_pool = _random_initialization(self, X, y, initial_selection)
        return labeled_pool

    def _iteration(self, X, y, **kwargs):
        # Fit on the labeled pool and return class probabilities for the
        # remaining unlabeled observations.
        self.classifier_.fit(X[self.labeled_pool_], y[self.labeled_pool_])
        return self.classifier_.predict_proba(X[~self.labeled_pool_])

    def _oracle(self, probabilities):
        # Label the ``budget_`` unlabeled observations with the highest
        # classification uncertainty.
        uncertainties = self.acquisition_func_(probabilities)
        unlabeled_ids = np.argwhere(~self.labeled_pool_).squeeze()
        ids = (
            unlabeled_ids[np.argsort(uncertainties)[::-1][: self.budget_]]
            if unlabeled_ids.ndim >= 1
            else unlabeled_ids.flatten()[0]
        )
        self.labeled_pool_[ids] = True
        return self

class AugmentationAL(BaseActiveLearner, ClassifierMixin):
    """
    Active Learning with pipelined Data Augmentation. This method is
    implemented and analysed in a working paper.

    Parameters
    ----------
    classifier : classifier object, default=None
        Classifier or pipeline to be trained in the iterative process. If
        None, defaults to sklearn's ``RandomForestClassifier`` with default
        parameters and uses the ``random_state`` passed in the Active
        Learning model.

    generator : generator estimator, default=None
        Generator to be used for artificial data generation within Active
        Learning iterations.

    param_grid : dict, default=None
        Used to optimize the classifier and generator hyperparameters at
        each iteration via cross-validated grid-search. Dictionary with
        parameter names (``str``) as keys and lists of parameter settings to
        try as values. This enables searching over any sequence of parameter
        settings. If None, parameter tuning is skipped.

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy. Used to optimize
        the classifier and generator hyperparameters at each iteration.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - `CV splitter`.

        For integer/None inputs, if the estimator is a classifier and ``y``
        is either binary or multiclass, :class:`StratifiedKFold` is used. In
        all other cases, :class:`KFold` is used. These splitters are
        instantiated with `shuffle=False` so the splits will be the same
        across calls.

    acquisition_func : function or {'entropy', 'breaking_ties', 'random'}, default=None
        Method used to quantify the prediction's uncertainty level. All
        predefined functions are set up so that a higher value means higher
        uncertainty (higher likelihood of selection) and vice versa. The
        uncertainty estimate is used to select the instances to be added to
        the labeled/training dataset. Acquisition functions may be added or
        changed in the ``UNCERTAINTY_FUNCTIONS`` dictionary. If None,
        defaults to "random".

    n_init : int or float, default=None
        Number of observations to include in the initial training dataset.
        If ``n_init`` < 1, then the corresponding percentage of the original
        dataset will be used as the initial training set. If None, defaults
        to 2% of the size of the original dataset.

    budget : int or float, default=None
        Number of observations to be added to the training dataset at each
        iteration. If ``budget`` < 1, then the corresponding percentage of
        the original dataset will be added at each iteration. If None,
        defaults to 2% of the size of the original dataset.

    max_iter : int, default=None
        Maximum number of iterations allowed. If None, the experiment will
        run until 100% of the dataset is added to the training set.

    evaluation_metric : string, default='accuracy'
        Metric used to calculate the test scores. See ``mlresearch.metrics``
        for info on available performance metrics.

    continue_training : bool, default=False
        If ``False``, fit a new classifier at each iteration. If ``True``,
        the classifier fitted in the previous iteration is used for further
        training in subsequent iterations.

    random_state : int, RandomState instance, default=None
        Control the randomization of the algorithm.

        - If int, ``random_state`` is the seed used by the random number
          generator;
        - If ``RandomState`` instance, ``random_state`` is the random number
          generator;
        - If ``None``, the random number generator is the ``RandomState``
          instance used by ``np.random``.

    Attributes
    ----------
    acquisition_func_ : function
        Method used to calculate the classification uncertainty at each
        iteration.

    evaluation_metric_ : scorer
        Metric used to estimate the performance of the AL classifier at each
        iteration.

    classifier_ : estimator object
        The classifier used in the iterative process. It is the classifier
        fitted in the last iteration.

    metadata_ : dict
        Contains the performance estimations, classifiers, labeled pool mask
        and original dataset.

    n_init_ : int
        Number of observations included in the initial training dataset.

    budget_ : int
        Number of observations added to the training set per iteration.

    max_iter_ : int
        Maximum number of iterations allowed.

    labeled_pool_ : array-like of shape (n_samples,)
        Mask that filters the labeled observations from the original dataset.
    """

    def __init__(
        self,
        classifier: Union[BaseEstimator, ClassifierMixin] = None,
        generator: BaseOverSampler = None,
        param_grid: dict = None,
        cv=None,
        acquisition_func=None,
        n_init: Union[int, float] = None,
        budget: Union[int, float] = None,
        max_iter: int = None,
        evaluation_metric=None,
        continue_training: bool = False,
        random_state: int = None,
    ):
        super().__init__(
            classifier=classifier,
            acquisition_func=acquisition_func,
            n_init=n_init,
            budget=budget,
            max_iter=max_iter,
            evaluation_metric=evaluation_metric,
            continue_training=continue_training,
            random_state=random_state,
        )
        self.generator = generator
        self.param_grid = param_grid
        self.cv = cv

    def _check(self, X, y):
        super()._check(X, y)

        # Generator
        if (
            self.generator is not None
            and hasattr(self.generator, "random_state")
            and self.generator.random_state is None
            and self.random_state is not None
        ):
            # Check random state
            self._generator = clone(self.generator)
            self._generator.set_params(random_state=self.random_state)

            # Add generator to classifier as a pipeline
            generator = clone(self._generator)
            classifier = clone(self._classifier)
            self._classifier = Pipeline(
                [("generator", generator), ("classifier", classifier)]
            )

        # Check if parameters in param_grid are valid
        if type(self.param_grid) is dict:
            for key in self.param_grid.keys():
                if key not in self._classifier.get_params():
                    raise ValueError(
                        f"Invalid parameter {key} for generator or classifier "
                        f"in {self}. Check the list of available parameters "
                        "with `almodel._classifier.get_params().keys()`."
                    )
        elif self.param_grid is not None:
            raise TypeError(
                "``param_grid`` must be a dict or None. "
                f"Got {self.param_grid} instead."
            )

    def _save_metadata(self, X, y, **kwargs):
        super()._save_metadata(X, y, **kwargs)
        if hasattr(self, "classifier_") and type(self.classifier_) is GridSearchCV:
            # Store the hyperparameter values selected at this iteration.
            self.metadata_[self._current_iter]["parameters"] = {
                k: v
                for k, v in self.classifier_.best_estimator_.get_params().items()
                if k in self.param_grid.keys()
            }

    def _check_cross_validation(self, y):
        """Define cross-validation object"""
        # Cap the number of folds at the frequency of the rarest class, so
        # that every fold can contain at least one observation of each class.
        min_frequency = np.unique(y, return_counts=True)[-1].min()
        cv = deepcopy(self.cv)
        if hasattr(self.cv, "n_splits"):
            cv.n_splits = min(min_frequency, cv.n_splits)
        elif type(self.cv) is int:
            cv = min(min_frequency, cv)
        elif cv is None:
            cv = min(min_frequency, 5)
        else:
            raise TypeError(
                "``cv`` object must be of type int or cross-validation "
                f"generator. Got {self.cv} instead."
            )
        return cv

    def _initialization(self, X=None, y=None, initial_selection=None):
        labeled_pool = _random_initialization(self, X, y, initial_selection)
        return labeled_pool

    def _iteration(self, X, y, **kwargs):
        # Set up parameter tuning within iterations; tuning is skipped when
        # the rarest class has a single observation (i.e., cv == 1).
        cv = self._check_cross_validation(y[self.labeled_pool_])
        if self.param_grid is not None and cv != 1:
            self.classifier_ = GridSearchCV(
                estimator=self.classifier_,
                param_grid=self.param_grid,
                scoring=self.evaluation_metric_,
                cv=cv,
                refit=True,
            )
        self.classifier_.fit(X[self.labeled_pool_], y[self.labeled_pool_])
        return self.classifier_.predict_proba(X[~self.labeled_pool_])

    def _oracle(self, probabilities):
        # Label the ``budget_`` unlabeled observations with the highest
        # classification uncertainty.
        uncertainties = self.acquisition_func_(probabilities)
        unlabeled_ids = np.argwhere(~self.labeled_pool_).squeeze()
        ids = (
            unlabeled_ids[np.argsort(uncertainties)[::-1][: self.budget_]]
            if unlabeled_ids.ndim >= 1
            else unlabeled_ids.flatten()[0]
        )
        self.labeled_pool_[ids] = True
        return self