# Source code for mlresearch.datasets._binary

"""
Download, transform and simulate various binary datasets.
"""

# Author: Georgios Douzas <gdouzas@icloud.com>
#         Joao Fonseca <jpfonseca@novaims.unl.pt>
# License: MIT

from re import sub
from collections import Counter
from itertools import product
from urllib.parse import urljoin
from string import ascii_lowercase
from zipfile import ZipFile
from io import BytesIO, StringIO

import requests
import numpy as np
import pandas as pd
from sklearn.utils import check_X_y
from imblearn.datasets import make_imbalance

from .base import Datasets, FETCH_URLS, RANDOM_STATE



[docs]
class ImbalancedBinaryDatasets(Datasets):
    """Class to download, transform and save binary class imbalanced
    datasets."""

    MULTIPLICATION_FACTORS = [2, 3]

    @staticmethod
    def _calculate_ratio(multiplication_factor, y):
        """Calculate ratio based on IRs multiplication factor."""
        ratio = Counter(y).copy()
        ratio[1] = int(ratio[1] / multiplication_factor)
        return ratio

    def _make_imbalance(self, data, multiplication_factor):
        """Return ``data`` with class ``1`` undersampled.

        The frame is split into features and the ``target`` column,
        validated, and resampled only when ``multiplication_factor`` is
        greater than one. The result is a new frame whose last column is an
        integer ``target``.
        """
        feature_names = [name for name in data.columns if name != "target"]
        X, y = check_X_y(data.loc[:, feature_names], data.target)
        if multiplication_factor > 1.0:
            X, y = make_imbalance(
                X,
                y,
                sampling_strategy=self._calculate_ratio(multiplication_factor, y),
                random_state=RANDOM_STATE,
            )
        # Features and labels are stacked back together; the label ends up
        # in the last (positional) column.
        resampled = pd.DataFrame(np.column_stack((X, y)))
        label_column = resampled.columns[-1]
        resampled[label_column] = resampled[label_column].astype(int)
        return resampled.rename(columns={label_column: "target"})


[docs]
    def download(self):
        """Download the datasets, then add undersampled variants of each.

        For every downloaded dataset and every entry of
        ``MULTIPLICATION_FACTORS`` an undersampled copy named
        ``"<name> (<factor>)"`` is appended, provided at least 15 samples of
        class ``1`` would remain.
        """
        super().download()
        extra_datasets = []
        for name, data in self.content_:
            for factor in self.MULTIPLICATION_FACTORS:
                # Skip variants that would leave too few class-1 samples.
                if self._calculate_ratio(factor, data.target)[1] < 15:
                    continue
                undersampled = self._make_imbalance(data, factor)
                extra_datasets.append((f"{name} ({factor})", undersampled))
        self.content_ += extra_datasets
        return self



[docs]
    def fetch_breast_tissue(self):
        """Download and transform the Breast Tissue Data Set.

        The minority class is identified as the `car` and `fad`
        labels and the majority class as the rest of the labels.

        http://archive.ics.uci.edu/ml/datasets/breast+tissue

        Returns
        -------
        data : pandas.DataFrame
            The ``Data`` sheet without the ``Case #`` column, with the
            ``Class`` column renamed to ``target`` and binarised.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status code.
        """
        response = requests.get(FETCH_URLS["breast_tissue"])
        # Surface the real HTTP failure instead of an opaque ``BadZipFile``
        # raised while trying to unzip an error page.
        response.raise_for_status()
        # The context manager closes the archive once the member is read.
        with ZipFile(BytesIO(response.content)) as archive:
            unzipped_data = archive.read("BreastTissue.xls")
        data = pd.read_excel(BytesIO(unzipped_data), sheet_name="Data")

        data = data.drop(columns="Case #").rename(columns={"Class": "target"})
        data["target"] = data["target"].isin(["car", "fad"]).astype(int)
        return data



[docs]
    def fetch_ecoli(self):
        """Download and transform the Ecoli Data Set.

        The minority class is identified as the `pp` label
        and the majority class as the rest of the labels.

        https://archive.ics.uci.edu/ml/datasets/ecoli

        Returns
        -------
        data : pandas.DataFrame
            The parsed file without column ``0``, with column ``8`` renamed
            to ``target`` and binarised.
        """
        # ``sep=r"\s+"`` is the documented replacement for the deprecated
        # ``delim_whitespace=True`` option and parses the file identically.
        data = pd.read_csv(FETCH_URLS["ecoli"], header=None, sep=r"\s+")
        data = data.drop(columns=0).rename(columns={8: "target"})
        data["target"] = data["target"].isin(["pp"]).astype(int)
        return data



[docs]
    def fetch_eucalyptus(self):
        """Download and transform the Eucalyptus Data Set.

        Only the last nine columns are kept and rows containing a ``?``
        entry or a missing value are discarded. The `best` label is the
        minority class; all other labels form the majority class.

        https://www.openml.org/d/188
        """
        raw = pd.read_csv(FETCH_URLS["eucalyptus"])
        trailing = raw.iloc[:, -9:].rename(columns={"Utility": "target"})
        # Blank out "?" placeholders so that dropna removes those rows too.
        complete = trailing.where(trailing != "?").dropna()
        is_best = complete["target"].isin(["best"])
        return complete.assign(target=is_best.astype(int))



[docs]
    def fetch_glass(self):
        """Download and transform the Glass Identification Data Set.

        Column ``0`` is dropped and column ``10`` becomes the target. The
        `1` label is the minority class; all other labels form the majority
        class.

        https://archive.ics.uci.edu/ml/datasets/glass+identification
        """
        raw = pd.read_csv(FETCH_URLS["glass"], header=None)
        labelled = raw.drop(columns=0).rename(columns={10: "target"})
        is_minority = labelled["target"].isin([1])
        return labelled.assign(target=is_minority.astype(int))



[docs]
    def fetch_haberman(self):
        """Download and transform the Haberman's Survival Data Set.

        Rows with the original label `2` are encoded as the minority
        class (``1``) and all remaining rows as the majority class (``0``).

        https://archive.ics.uci.edu/dataset/43/haberman+s+survival

        Returns
        -------
        data : pandas.DataFrame
            The parsed ``haberman.data`` file with column ``3`` renamed to
            ``target`` and binarised.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status code.
        """
        response = requests.get(FETCH_URLS["haberman"])
        # Surface the real HTTP failure instead of an opaque ``BadZipFile``
        # raised while trying to unzip an error page.
        response.raise_for_status()
        # The context manager closes the archive once the member is read.
        with ZipFile(BytesIO(response.content)) as archive:
            unzipped_data = archive.read("haberman.data")
        data = pd.read_csv(BytesIO(unzipped_data), header=None)

        data.rename(columns={3: "target"}, inplace=True)
        data["target"] = data["target"].isin([2]).astype(int)
        return data



[docs]
    def fetch_heart(self):
        """Download and transform the Heart Data Set.

        The minority class is identified as the `2` label
        and the majority class as the `1` label.

        http://archive.ics.uci.edu/ml/datasets/statlog+(heart)

        Returns
        -------
        data : pandas.DataFrame
            The parsed file with column ``13`` renamed to ``target`` and
            binarised.
        """
        # ``sep=r"\s+"`` is the documented replacement for the deprecated
        # ``delim_whitespace=True`` option and parses the file identically.
        data = pd.read_csv(FETCH_URLS["heart"], header=None, sep=r"\s+")
        data.rename(columns={13: "target"}, inplace=True)
        data["target"] = data["target"].isin([2]).astype(int)
        return data



[docs]
    def fetch_iris(self):
        """Download and transform the Iris Data Set.

        Column ``4`` becomes the target. Rows labelled `Iris-setosa` are
        encoded as ``1`` and all other rows as ``0``.

        https://archive.ics.uci.edu/ml/datasets/iris
        """
        labelled = pd.read_csv(FETCH_URLS["iris"], header=None).rename(
            columns={4: "target"}
        )
        is_setosa = labelled["target"].isin(["Iris-setosa"])
        return labelled.assign(target=is_setosa.astype(int))



[docs]
    def fetch_libras(self):
        """Download and transform the Libras Movement Data Set.

        Column ``90`` becomes the target. The `1` label is the minority
        class; all other labels form the majority class.

        https://archive.ics.uci.edu/ml/datasets/Libras+Movement
        """
        labelled = pd.read_csv(FETCH_URLS["libras"], header=None).rename(
            columns={90: "target"}
        )
        is_minority = labelled["target"].isin([1])
        return labelled.assign(target=is_minority.astype(int))



[docs]
    def fetch_liver(self):
        """Download and transform the Liver Disorders Data Set.

        The minority class is identified as the `1` label
        and the majority class as the `2` label.

        https://archive.ics.uci.edu/ml/datasets/liver+disorders

        Returns
        -------
        data : pandas.DataFrame
            The parsed ``bupa.data`` file with column ``6`` renamed to
            ``target`` and binarised.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status code.
        """
        response = requests.get(FETCH_URLS["liver"])
        # Surface the real HTTP failure instead of an opaque ``BadZipFile``
        # raised while trying to unzip an error page.
        response.raise_for_status()
        # The context manager closes the archive once the member is read.
        with ZipFile(BytesIO(response.content)) as archive:
            unzipped_data = archive.read("bupa.data")
        data = pd.read_csv(BytesIO(unzipped_data), header=None)

        data.rename(columns={6: "target"}, inplace=True)
        data["target"] = data["target"].isin([1]).astype(int)
        return data



[docs]
    def fetch_pima(self):
        """Download and transform the Pima Indians Diabetes Data Set.

        The first nine lines of the file are skipped and column ``8``
        becomes the target; its values are kept as downloaded (`1` is the
        minority label and `0` the majority label).

        https://www.kaggle.com/uciml/pima-indians-diabetes-database
        """
        raw = pd.read_csv(FETCH_URLS["pima"], header=None, skiprows=9)
        return raw.rename(columns={8: "target"})



[docs]
    def fetch_vehicle(self):
        """Download and transform the Vehicle Silhouettes Data Set.

        The data is spread over the files ``xaa.dat`` to ``xai.dat``, which
        are downloaded one by one and concatenated. Rows labelled `van` are
        encoded as the minority class (``1``) and all other rows as the
        majority class (``0``).

        https://archive.ics.uci.edu/ml/datasets/Statlog+(Vehicle+Silhouettes)

        Returns
        -------
        data : pandas.DataFrame
            The concatenated parts, each with column ``18`` renamed to
            ``target`` and binarised. The original per-file row index is
            preserved.
        """
        # The base URL does not depend on the part, so compute it once.
        base_url = FETCH_URLS["vehicle"].replace("Index", "")
        data = []
        for letter in ascii_lowercase[0:9]:
            # ``sep=r"\s+"`` is the documented replacement for the deprecated
            # ``delim_whitespace=True`` option and parses the file identically.
            partial_data = pd.read_csv(
                urljoin(base_url, f"xa{letter}.dat"),
                header=None,
                sep=r"\s+",
            )
            partial_data = partial_data.rename(columns={18: "target"})
            partial_data["target"] = partial_data["target"].isin(["van"]).astype(int)
            data.append(partial_data)
        return pd.concat(data)



[docs]
    def fetch_wine(self):
        """Download and transform the Wine Data Set.

        Column ``0`` becomes the target. The `2` label is the minority
        class; all other labels form the majority class.

        https://archive.ics.uci.edu/ml/datasets/wine
        """
        labelled = pd.read_csv(FETCH_URLS["wine"], header=None).rename(
            columns={0: "target"}
        )
        is_minority = labelled["target"].isin([2])
        return labelled.assign(target=is_minority.astype(int))



[docs]
    def fetch_new_thyroid_1(self):
        """Download and transform the Thyroid Disease Data Set.

        Column ``0`` becomes the target. The `positive` (minority) class
        is encoded as ``1`` and the `negative` (majority) class as ``0``.

        .. note:: The positive class was originally label 2.

        https://archive.ics.uci.edu/ml/datasets/Thyroid+Disease
        """
        raw = pd.read_csv(FETCH_URLS["new_thyroid"], header=None)
        labelled = raw.rename(columns={0: "target"})
        is_positive = labelled["target"].eq(2)
        return labelled.assign(target=is_positive.astype(int))



[docs]
    def fetch_new_thyroid_2(self):
        """Download and transform the Thyroid Disease Data Set.

        Column ``0`` becomes the target. The `positive` (minority) class
        is encoded as ``1`` and the `negative` (majority) class as ``0``.

        .. note:: The positive class was originally label 3.

        https://archive.ics.uci.edu/ml/datasets/Thyroid+Disease
        """
        raw = pd.read_csv(FETCH_URLS["new_thyroid"], header=None)
        labelled = raw.rename(columns={0: "target"})
        is_positive = labelled["target"].eq(3)
        return labelled.assign(target=is_positive.astype(int))



[docs]
    def fetch_cleveland(self):
        """Download and transform the Heart Disease Cleveland Data Set.

        ``?`` entries are read as missing values and rows containing them
        are dropped. Column ``13`` becomes the target: rows whose label
        equals ``1`` form the `positive` (minority) class, all others the
        `negative` (majority) class.

        https://archive.ics.uci.edu/ml/datasets/heart+disease
        """
        raw = pd.read_csv(FETCH_URLS["cleveland"], header=None, na_values="?")
        labelled = raw.dropna().rename(columns={13: "target"})
        is_positive = labelled["target"].eq(1)
        return labelled.assign(target=is_positive.astype(int))



[docs]
    def fetch_dermatology(self):
        """Download and transform the Dermatology Data Set.

        Column ``34`` becomes the target and column ``33`` is dropped. Rows
        whose label equals ``6`` form the `positive` (minority) class, all
        others the `negative` (majority) class.

        https://archive.ics.uci.edu/ml/datasets/Dermatology
        """
        raw = pd.read_csv(FETCH_URLS["dermatology"], header=None)
        labelled = raw.rename(columns={34: "target"}).drop(columns=33)
        is_positive = labelled["target"].eq(6)
        return labelled.assign(target=is_positive.astype(int))



[docs]
    def fetch_led(self):
        """Download and transform the LED Display Domain Data Set.

        The ``Class`` column becomes the target. Rows whose label equals
        ``1`` form the `positive` (minority) class, all others the
        `negative` (majority) class.

        https://www.openml.org/d/40496
        """
        labelled = pd.read_csv(FETCH_URLS["led"]).rename(columns={"Class": "target"})
        is_positive = labelled["target"].eq(1)
        return labelled.assign(target=is_positive.astype(int))



[docs]
    def fetch_page_blocks(self):
        """Download and transform the Page Blocks Data Set.

        The ``class`` column becomes the target. Rows whose label differs
        from ``1`` form the `positive` (minority) class; rows labelled ``1``
        form the `negative` (majority) class.

        https://www.openml.org/d/1021
        """
        labelled = pd.read_csv(FETCH_URLS["page_blocks"]).rename(
            columns={"class": "target"}
        )
        is_positive = labelled["target"].ne(1)
        return labelled.assign(target=is_positive.astype(int))



[docs]
    def fetch_vowel(self):
        """Download and transform the Vowel Recognition Data Set.

        The ``speaker`` column becomes the target and the ``utterance`` and
        ``frame`` columns are dropped. Rows whose label equals ``1`` form
        the `positive` (minority) class, all others the `negative`
        (majority) class.

        https://www.openml.org/d/375
        """
        raw = pd.read_csv(FETCH_URLS["vowels"])
        labelled = raw.rename(columns={"speaker": "target"}).drop(
            columns=["utterance", "frame"]
        )
        is_positive = labelled["target"].eq(1)
        return labelled.assign(target=is_positive.astype(int))



[docs]
    def fetch_yeast(self):
        """Download and transform the Yeast Data Set.

        Every line of the file is split on single spaces, ignoring the empty
        tokens produced by repeated spaces. The first token of each row is
        dropped and column ``9`` becomes the target: rows labelled `MIT`
        form the `positive` (minority) class, all others the `negative`
        (majority) class.

        https://archive.ics.uci.edu/ml/datasets/Yeast
        """
        raw = pd.read_csv(FETCH_URLS["yeast"], header=None)
        rows = []
        for line in raw[0].tolist():
            rows.append([token for token in line.split(" ") if token])
        table = pd.DataFrame(rows).drop(columns=0).rename(columns={9: "target"})
        is_positive = table["target"] == "MIT"
        table["target"] = is_positive.astype(int)
        return table





[docs]
class BinaryDatasets(Datasets):
    """Class to download, transform and save binary class datasets."""


[docs]
    def fetch_banknote_authentication(self):
        """Download and transform the Banknote Authentication Data Set.

        Column ``4`` becomes the target; its values are kept as downloaded.

        https://archive.ics.uci.edu/ml/datasets/banknote+authentication
        """
        raw = pd.read_csv(FETCH_URLS["banknote_authentication"], header=None)
        return raw.rename(columns={4: "target"})



[docs]
    def fetch_arcene(self):
        """Download and transform the Arcene Data Set.

        The train and validation splits are read from the downloaded
        archive, stacked on top of each other and joined with their labels.
        Columns ``1998`` to ``10000`` of the feature files are dropped. Rows
        labelled ``1`` are encoded as ``1`` and all other rows as ``0``.

        https://archive.ics.uci.edu/ml/datasets/Arcene

        Returns
        -------
        data : pandas.DataFrame
            The retained feature columns followed by a binary ``target``
            column.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status code.
        """
        response = requests.get(FETCH_URLS["arcene"])
        # Surface the real HTTP failure instead of an opaque ``BadZipFile``
        # raised while trying to unzip an error page.
        response.raise_for_status()

        data, labels = [], []
        # The context manager closes the archive once all members are read.
        with ZipFile(BytesIO(response.content)) as archive:
            for data_type in ("train", "valid"):
                raw_features = archive.read(
                    f"ARCENE/arcene_{data_type}.data"
                ).decode("utf-8")
                data.append(
                    pd.read_csv(StringIO(raw_features), header=None, sep=" ").drop(
                        columns=list(range(1998, 10001))
                    )
                )
                # Only the train labels are read from the ``ARCENE/`` folder;
                # the validation labels are read from the archive root.
                labels_member = (
                    "ARCENE/" if data_type == "train" else ""
                ) + f"arcene_{data_type}.labels"
                raw_labels = archive.read(labels_member).decode("utf-8")
                labels.append(
                    pd.read_csv(StringIO(raw_labels), header=None).rename(
                        columns={0: "target"}
                    )
                )

        data = pd.concat(data, ignore_index=True)
        labels = pd.concat(labels, ignore_index=True)
        data = pd.concat([data, labels], axis=1)
        data["target"] = data["target"].isin([1]).astype(int)
        return data



[docs]
    def fetch_audit(self):
        """Download and transform the Audit Data Set.

        ``audit_data/audit_risk.csv`` is read from the downloaded archive.
        The ``LOCATION_ID`` column is dropped, the ``Risk`` column becomes
        the target and rows with missing values are discarded.

        https://archive.ics.uci.edu/ml/datasets/Audit+Data

        Returns
        -------
        data : pandas.DataFrame
            The cleaned audit data with a ``target`` column.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status code.
        """
        response = requests.get(FETCH_URLS["audit"])
        # Surface the real HTTP failure instead of an opaque ``BadZipFile``
        # raised while trying to unzip an error page.
        response.raise_for_status()
        # The context manager closes the archive once the member is read.
        with ZipFile(BytesIO(response.content)) as archive:
            unzipped_data = archive.read("audit_data/audit_risk.csv").decode("utf-8")
        # Strip every "@..." span together with the newlines that follow it
        # before handing the text to the CSV parser.
        data = pd.read_csv(StringIO(sub(r"@.+\n+", "", unzipped_data)), engine="python")
        data = (
            data.drop(columns=["LOCATION_ID"])
            .rename(columns={"Risk": "target"})
            .dropna()
        )
        return data



[docs]
    def fetch_spambase(self):
        """Download and transform the Spambase Data Set.

        Column ``57`` becomes the target; its values are kept as downloaded.

        https://archive.ics.uci.edu/ml/datasets/Spambase
        """
        raw = pd.read_csv(FETCH_URLS["spambase"], header=None)
        return raw.rename(columns={57: "target"})



[docs]
    def fetch_parkinsons(self):
        """Download and transform the Parkinsons Data Set.

        The ``name`` column is dropped and the ``status`` column is moved to
        the end as the target. Rows whose status equals ``0`` are encoded as
        ``1`` and all other rows as ``0``.

        https://archive.ics.uci.edu/ml/datasets/parkinsons
        """
        raw = pd.read_csv(FETCH_URLS["parkinsons"])
        features = raw.drop(columns=["name", "status"])
        target = raw[["status"]].rename(columns={"status": "target"})
        combined = pd.concat([features, target], axis=1)
        is_positive = combined["target"].isin([0])
        combined["target"] = is_positive.astype(int)
        return combined



[docs]
    def fetch_ionosphere(self):
        """Download and transform the Ionosphere Data Set.

        Columns ``0`` and ``1`` are dropped and column ``34`` becomes the
        target. Rows labelled `b` are encoded as ``1`` and all other rows
        as ``0``.

        https://archive.ics.uci.edu/ml/datasets/ionosphere
        """
        raw = pd.read_csv(FETCH_URLS["ionosphere"], header=None)
        labelled = raw.drop(columns=[0, 1]).rename(columns={34: "target"})
        is_positive = labelled["target"].isin(["b"])
        return labelled.assign(target=is_positive.astype(int))



[docs]
    def fetch_breast_cancer(self):
        """Download and transform the Breast Cancer Wisconsin Data Set.

        Column ``0`` is dropped and column ``1`` is moved to the end as the
        target. Rows labelled `M` are encoded as ``1`` and all other rows
        as ``0``.

        https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
        """
        raw = pd.read_csv(FETCH_URLS["breast_cancer"], header=None)
        features = raw.drop(columns=[0, 1])
        target = raw[[1]].rename(columns={1: "target"})
        combined = pd.concat([features, target], axis=1)
        is_positive = combined["target"].isin(["M"])
        combined["target"] = is_positive.astype(int)
        return combined