# Source code for mlresearch.metrics._synth_data_quality

"""
Implementation of the 3 synthetic data quality metrics from the paper 'How Faithful is
your Synthetic Data? Sample-level Metrics for Evaluating and Auditing Generative Models'
from Alaa et al (2022).
"""

import numpy as np
from sklearn.metrics._scorer import _BaseScorer
from sklearn.neighbors import NearestNeighbors


class _BaseSynthQualityScorer(_BaseScorer):
    """
    Shared base class of the synthetic data quality scorers defined in this module.

    It only customises the string representation and neutralises
    ``set_score_request``.
    """

    def __repr__(self):
        """
        Return ``make_scorer(<ClassName>, <name>=<value>, ...)``, listing every
        instance attribute in the order it was set.
        """
        class_name = type(self).__name__
        rendered_attrs = "".join(
            f", {name}={value}" for name, value in vars(self).items()
        )
        return f"make_scorer({class_name}{rendered_attrs})"

    def set_score_request(self):
        """
        Placeholder to overwrite sklearn's ``_BaseScorer.set_score_request`` function.
        It is not used and was raising a docstring error with scikit-learn v1.3.0.

        Note
        ----
        This placeholder will be removed soon
        """
        return None



# [docs] (Sphinx viewcode link residue)
class AlphaPrecision(_BaseSynthQualityScorer):
    """
    Measures synthetic data fidelity. It estimates the probability that a synthetic
    sample resides in the $\\alpha$-support of the real distribution.

    This is an implementation of the metric proposed in [1]_.

    The scorer is used in two steps: ``fit`` receives the real dataset and ``score``
    receives the synthetic dataset. Both are mapped to scores with `scorer_real`. A
    synthetic sample is flagged as being inside the support when the absolute
    difference between its score and `center_` is strictly smaller than the
    ``1 - alpha`` quantile of the same absolute differences computed over the real
    scores.

    .. warning::
        This metric is not listed in the ``get_scorer_names`` function since it is
        following an unconventional structure.

    Parameters
    ----------
    scorer_real : function
        Method used to map a dataset into a score, or a 1-dimensional projection of
        itself. The mapping should be modelled over the original (real) dataset. It
        is called as ``scorer_real(X)`` on both the real and the synthetic data.

    alpha : float, default=0.05
        Controls the radius of the euclidean ball. The radius is the ``1 - alpha``
        quantile of the absolute deviations of the real scores from `center_`.

    Attributes
    ----------
    center_ : float
        Value of the center of the euclidean ball, i.e., the median of the scores
        obtained for the real dataset passed to ``fit``.

    References
    ----------

    .. [1] Alaa, A., Van Breugel, B., Saveliev, E. S., & van der Schaar, M. (2022, June).
        How faithful is your synthetic data? sample-level metrics for evaluating and
        auditing generative models. In International Conference on Machine Learning
        (pp. 290-306). PMLR.

    """

    def __init__(self, scorer_real, alpha=0.05):
        # Parameters are stored untouched; all computation happens in ``fit`` and
        # ``score``.
        self.scorer_real = scorer_real
        self.alpha = alpha


    # [docs] (Sphinx viewcode link residue)
    def fit(self, X_real):
        """
        Compute the statistics required to calculate $\\alpha$-precision.

        The real data is mapped to scores with `self.scorer_real`. The median score
        is stored as `center_` and the absolute deviation of every real score from
        that median is kept for later use in ``score``.

        Parameters
        ----------
        X_real : array-like or pd.DataFrame, shape (n_samples, n_features)
            The real (original) dataset, passed as is to `self.scorer_real`.

        Returns
        -------
        self : object
            Returns an instance of the class.
        """
        real_scores = self.scorer_real(X_real)
        median_score = np.median(real_scores)

        self.center_ = median_score
        self._dist = np.abs(real_scores - median_score)
        return self



    # [docs] (Sphinx viewcode link residue)
    def score(self, X_synth):
        """
        Flag, for each synthetic sample, whether it resides in the $\\alpha$-support
        of the original distribution (1) or not (0).

        Parameters
        ----------
        X_synth : array-like or pd.DataFrame, shape (n_samples, n_features)
            Synthetic data over which $\\alpha$-precision will be calculated.

        Returns
        -------
        scores : np.ndarray, shape (n_samples,)
            $\\alpha$-precision scores.
        """
        # The radius is derived from the deviations of the real scores stored by
        # ``fit``
        ball_radius = np.quantile(self._dist, 1 - self.alpha)

        synth_scores = self.scorer_real(X_synth)
        deviations = np.abs(synth_scores - self.center_)

        # Strict inequality: samples lying exactly on the boundary are left out
        inside_support = deviations < ball_radius
        return inside_support.astype(int)





# [docs] (Sphinx viewcode link residue)
class BetaRecall(_BaseSynthQualityScorer):
    """
    Checks whether the synthetic data is diverse enough to cover the variability of real
    data, i.e., a model should be able to generate a wide variety of good samples.

    This is an implementation of the metric proposed in [1]_.

    The scorer is fitted on the synthetic dataset (``fit``) and evaluated on the real
    dataset (``score``). The operating mode depends on `scorer_synth`:

    - If `scorer_synth` is ``None``, ``fit`` keeps the synthetic samples that lie
      inside the euclidean ball centered on their feature-wise mean, and ``score``
      relies on nearest neighbors searches over the real data and over the retained
      synthetic samples.
    - Otherwise, samples are mapped to scores with `scorer_synth` and a real sample
      is flagged when its score lies inside the ball centered on the median of the
      synthetic scores.

    .. warning::
        This metric is not listed in the ``get_scorer_names`` function since it is
        following an unconventional structure.

    Parameters
    ----------
    scorer_synth : function, default=None
        Method used to map a dataset into a score, or a 1-dimensional projection of
        itself. The mapping should be modelled over the synthetic dataset. If
        ``None``, the nearest neighbors approach is used instead.

    beta : float, default=0.05
        Controls the radius of the euclidean ball. The radius is the ``1 - beta``
        quantile of the distances between the synthetic samples (or their scores)
        and `center_`.

    n_neighbors : int, default=5
        Number of neighbors to use by default for computing the radius for each sample
        in `X_real` for scoring. Ignored if `scorer_synth` is not `None`.

    metric : str or callable, default='euclidean'
        Metric to use for distance computation. Default is "euclidean", which
        results in the standard Euclidean distance. See the
        documentation of `scipy.spatial.distance
        <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
        the metrics listed in
        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
        values.

        If metric is a callable function, it takes two arrays representing 1D
        vectors as inputs and must return one value indicating the distance
        between those vectors. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

        Ignored if `scorer_synth` is not `None`.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. Ignored if `scorer_synth` is not `None`.

    Attributes
    ----------
    center_ : array-like
        Center of the euclidean ball: the feature-wise mean of `X_synth` if
        `scorer_synth` is ``None``, the median of the synthetic scores otherwise.

    radius_ : float
        Radius of the euclidean ball, i.e., the ``1 - beta`` quantile of the
        distances to `center_`.

    X_synth_ : array-like
        Copy of the synthetic samples whose distance to `center_` is strictly
        smaller than `radius_`. Only set if `scorer_synth` is ``None``.

    References
    ----------

    .. [1] Alaa, A., Van Breugel, B., Saveliev, E. S., & van der Schaar, M. (2022, June).
        How faithful is your synthetic data? sample-level metrics for evaluating and
        auditing generative models. In International Conference on Machine Learning
        (pp. 290-306). PMLR.

    """

    def __init__(
        self,
        scorer_synth=None,
        beta=0.05,
        n_neighbors=5,
        metric="euclidean",
        n_jobs=None,
    ):
        # Parameters are stored untouched; all computation happens in ``fit`` and
        # ``score``.
        self.scorer_synth = scorer_synth
        self.beta = beta
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.n_jobs = n_jobs

    def _fit_with_knn(self, X_synth):
        """
        Fit the scorer for the nearest neighbors approach (no `scorer_synth`).

        The center of the ball is the feature-wise mean of `X_synth` and its radius
        is the ``1 - beta`` quantile of the distances between each synthetic sample
        and that center. Only the samples strictly inside the ball are stored, as a
        copy, in `X_synth_`.

        Parameters
        ----------
        X_synth : np.ndarray or pd.DataFrame, shape (n_samples, n_features)
            The synthetic dataset. It must expose ``mean(axis=0)`` and support
            boolean-mask row selection.

        Returns
        -------
        self : object
            Returns an instance of the class.
        """
        self.center_ = X_synth.mean(axis=0)
        self._dist = np.linalg.norm(X_synth - self.center_, axis=1)
        self.radius_ = np.quantile(self._dist, 1 - self.beta)

        # The distances to the center are already available in ``self._dist``;
        # reuse them instead of computing the same norms a second time.
        within_ball = self._dist < self.radius_
        self.X_synth_ = X_synth[within_ball].copy()
        return self

    def _fit_with_support_estimation(self, X_synth):
        """
        Fit the scorer using the scores produced by `self.scorer_synth`.

        The synthetic scores are arranged as a single column. Their median becomes
        `center_` and the ``1 - beta`` quantile of their distances to that median
        becomes `radius_`.

        Parameters
        ----------
        X_synth : array-like or pd.DataFrame, shape (n_samples, n_features)
            The synthetic dataset, passed as is to `self.scorer_synth`.

        Returns
        -------
        self : object
            Returns an instance of the class.
        """
        raw_scores = self.scorer_synth(X_synth)
        score_column = raw_scores.reshape(-1, 1)

        self.center_ = np.median(score_column, axis=0)
        deviations = score_column - self.center_
        self._dist = np.linalg.norm(deviations, axis=1)

        self.radius_ = np.quantile(self._dist, 1 - self.beta)
        return self


    # [docs] (Sphinx viewcode link residue)
    def fit(self, X_synth):
        """
        Compute the statistics required to calculate $\\beta$-recall.

        Delegates to the nearest neighbors variant when `self.scorer_synth` is
        ``None`` and to the support estimation variant otherwise.

        Parameters
        ----------
        X_synth : array-like or pd.DataFrame, shape (n_samples, n_features)
            The synthetic dataset from which the statistics are computed.

        Returns
        -------
        self : object
            Returns an instance of the class.
        """
        fit_variant = (
            self._fit_with_knn
            if self.scorer_synth is None
            else self._fit_with_support_estimation
        )
        fit_variant(X_synth)
        return self


    def _score_with_knn(self, X_real):
        """
        Score real samples with the nearest neighbors approach.

        Each real sample gets its own radius: the distance to the last of its
        `n_neighbors` nearest neighbors within `X_real`. The sample is flagged when
        its closest retained synthetic sample (`X_synth_`) is strictly nearer than
        that radius.

        Parameters
        ----------
        X_real : array-like or pd.DataFrame, shape (n_samples, n_features)
            The real dataset.

        Returns
        -------
        within_ball : np.ndarray of bool, shape (n_samples,)
            Whether each real sample is covered by the synthetic data.
        """
        nn_real = NearestNeighbors(
            n_neighbors=self.n_neighbors, metric=self.metric, n_jobs=self.n_jobs
        ).fit(X_real)

        # ``kneighbors`` returns ``(distances, indices)``. The radius must come from
        # the distances; the indices are positions in the fitted data and carry no
        # geometric meaning.
        # NOTE(review): X_real is both the fitted and the queried data, so each
        # sample is expected to be returned as its own first neighbor. The radius is
        # therefore the distance to the (n_neighbors - 1)-th *other* real sample.
        # Confirm this is the intended definition of k.
        real_distances, _ = nn_real.kneighbors(X_real)
        radius = real_distances[:, -1]

        nn_synth = NearestNeighbors(
            n_neighbors=1, metric=self.metric, n_jobs=self.n_jobs
        ).fit(self.X_synth_)
        synth_distances, _ = nn_synth.kneighbors(X_real)
        dists = synth_distances[:, -1]

        return dists < radius

    def _score_with_support_estimation(self, X_real):
        """
        Score real samples using the scores produced by `self.scorer_synth`.

        Parameters
        ----------
        X_real : array-like or pd.DataFrame, shape (n_samples, n_features)
            The real dataset, passed as is to `self.scorer_synth`.

        Returns
        -------
        within_ball : np.ndarray of bool, shape (n_samples,)
            Whether the score of each real sample is strictly closer to `center_`
            than `radius_`.
        """
        score_column = self.scorer_synth(X_real).reshape(-1, 1)
        deviations = np.linalg.norm(score_column - self.center_, axis=1)
        return deviations < self.radius_


    # [docs] (Sphinx viewcode link residue)
    def score(self, X_real):
        """
        Flag, for each real sample, whether it resides in the $\\beta$-support of
        the synthetic distribution (1) or not (0).

        Parameters
        ----------
        X_real : array-like or pd.DataFrame, shape (n_samples, n_features)
            Input data over which $\\beta$-recall will be calculated.

        Returns
        -------
        scores : np.ndarray, shape (n_samples,)
            $\\beta$-recall scores.
        """
        if self.scorer_synth is not None:
            membership = self._score_with_support_estimation(X_real)
        else:
            membership = self._score_with_knn(X_real)

        # Both variants return boolean masks; expose them as 0/1 integers
        return membership.astype(int)





# [docs] (Sphinx viewcode link residue)
class Authenticity(_BaseSynthQualityScorer):
    """
    Quantifies the rate by which a model generates new samples. In other words, this
    scorer assesses whether a sample is non-memorized.

    This is an implementation of the metric proposed in [1]_.

    The scorer is fitted on the real dataset (``fit``) and evaluated on the samples
    to audit (``score``). A sample is deemed non-authentic when its distance to the
    closest real sample is strictly smaller than the distance stored for that real
    sample in `distances_real_`.

    .. warning::
        This metric is not listed in the ``get_scorer_names`` function since it is
        following an unconventional structure.

    Parameters
    ----------
    metric : str or callable, default='euclidean'
        Metric to use for distance computation. Default is "euclidean", which
        results in the standard Euclidean distance. See the
        documentation of `scipy.spatial.distance
        <https://docs.scipy.org/doc/scipy/reference/spatial.distance.html>`_ and
        the metrics listed in
        :class:`~sklearn.metrics.pairwise.distance_metrics` for valid metric
        values.

        If metric is a callable function, it takes two arrays representing 1D
        vectors as inputs and must return one value indicating the distance
        between those vectors. This works for Scipy's metrics, but is less
        efficient than passing the metric name as a string.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors.

    Attributes
    ----------
    nn_ : estimator object
        Validated k-nearest neighbours algorithm. Used to find the nearest neighbors of
        the synthetic and the original data using the original data as a reference.

    distances_real_ : np.ndarray, shape (n_samples,)
        Distance to the nearest neighbor for each sample in `X_real`. It is read from
        the second column of a 2-nearest-neighbors query of `X_real` against itself.

    References
    ----------

    .. [1] Alaa, A., Van Breugel, B., Saveliev, E. S., & van der Schaar, M. (2022, June).
        How faithful is your synthetic data? sample-level metrics for evaluating and
        auditing generative models. In International Conference on Machine Learning
        (pp. 290-306). PMLR.

    """

    def __init__(self, metric="euclidean", n_jobs=None):
        # Parameters are stored untouched; all computation happens in ``fit`` and
        # ``score``.
        self.metric = metric
        self.n_jobs = n_jobs


    # [docs] (Sphinx viewcode link residue)
    def fit(self, X_real):
        """
        Compute the statistics required to calculate Authenticity.

        A 2-nearest-neighbors model is fitted on the real data and then queried with
        that same data. The second column of the resulting distances is stored as
        `distances_real_`.

        Parameters
        ----------
        X_real : array-like or pd.DataFrame, shape (n_samples, n_features)
            The real (original) dataset used as reference for the neighbors search.

        Returns
        -------
        self : object
            Returns an instance of the class.
        """
        knn = NearestNeighbors(n_neighbors=2, metric=self.metric, n_jobs=self.n_jobs)
        self.nn_ = knn.fit(X_real)

        # Only the distances are needed; the neighbor indices are discarded
        real_distances, _ = self.nn_.kneighbors(X_real)
        self.distances_real_ = real_distances[:, 1]
        return self



    # [docs] (Sphinx viewcode link residue)
    def score(self, X):
        """
        Flag each observation as authentic (1) or not (0).

        An observation is not authentic when its distance to the closest real sample
        is strictly smaller than the value stored for that real sample in
        `distances_real_`.

        Parameters
        ----------
        X : array-like or pd.DataFrame, shape (n_samples, n_features)
            Input data over which Authenticity will be calculated.

        Returns
        -------
        scores : np.ndarray, shape (n_samples,)
            Authenticity scores.
        """
        dist_matrix, index_matrix = self.nn_.kneighbors(X)

        # Keep only the closest real sample of each observation
        closest_dist = dist_matrix[:, 0]
        closest_idx = index_matrix[:, 0]

        memorized = closest_dist < self.distances_real_[closest_idx]
        return (~memorized).astype(int)