"""
Download, transform and simulate various binary datasets.
"""
# Author: Georgios Douzas <gdouzas@icloud.com>
# Joao Fonseca <jpfonseca@novaims.unl.pt>
# License: MIT
from re import sub
from collections import Counter
from itertools import product
from urllib.parse import urljoin
from string import ascii_lowercase
from zipfile import ZipFile
from io import BytesIO, StringIO
import requests
import numpy as np
import pandas as pd
from sklearn.utils import check_X_y
from imblearn.datasets import make_imbalance
from .base import Datasets, FETCH_URLS, RANDOM_STATE
[docs]
class ImbalancedBinaryDatasets(Datasets):
    """Class to download, transform and save binary class imbalanced
    datasets.

    Every ``fetch_*`` method returns a :class:`pandas.DataFrame` whose label
    column is named ``target``. Where a method binarizes the labels, the
    minority (positive) class is encoded as ``1`` and every other label as
    ``0``.
    """

    # Divisors applied to the count of class ``1`` when `download` builds the
    # additional undersampled variants of each dataset.
    MULTIPLICATION_FACTORS = [2, 3]

    @staticmethod
    def _calculate_ratio(multiplication_factor, y):
        """Calculate ratio based on IRs multiplication factor.

        Parameters
        ----------
        multiplication_factor : int or float
            Divisor applied to the number of samples labelled ``1``.
        y : iterable
            Class labels.

        Returns
        -------
        collections.Counter
            Per-class sample counts, with the count of class ``1`` divided by
            ``multiplication_factor`` and truncated to an integer.
        """
        ratio = Counter(y)
        ratio[1] = int(ratio[1] / multiplication_factor)
        return ratio

    def _make_imbalance(self, data, multiplication_factor):
        """Undersample the minority class.

        Parameters
        ----------
        data : pandas.DataFrame
            Dataset containing a ``target`` column.
        multiplication_factor : int or float
            Divisor for the minority class size. Values not greater than
            ``1.0`` skip the undersampling step.

        Returns
        -------
        pandas.DataFrame
            New frame built from the validated arrays. The feature columns
            are labelled with positional integers and the last column, cast
            to ``int``, is named ``target``.
        """
        X_columns = [col for col in data.columns if col != "target"]
        X, y = check_X_y(data.loc[:, X_columns], data.target)
        if multiplication_factor > 1.0:
            sampling_strategy = self._calculate_ratio(multiplication_factor, y)
            X, y = make_imbalance(
                X, y, sampling_strategy=sampling_strategy, random_state=RANDOM_STATE
            )
        # Rebuild the frame from the arrays; the original column names are
        # not carried over, only the label column is renamed.
        data = pd.DataFrame(np.column_stack((X, y)))
        data[data.columns[-1]] = data[data.columns[-1]].astype(int)
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        return data

    def download(self):
        """Download the datasets and append undersampled versions of them.

        For every downloaded dataset and every value in
        ``MULTIPLICATION_FACTORS`` an undersampled copy named
        ``"<name> (<factor>)"`` is appended to ``self.content_``, provided
        the copy would keep at least 15 samples of class ``1``.

        NOTE(review): assumes the parent ``download`` populates
        ``self.content_`` with ``(name, DataFrame)`` pairs -- confirm against
        the base class.

        Returns
        -------
        self
        """
        super().download()
        undersampled_datasets = []
        for (name, data), factor in product(
            self.content_, self.MULTIPLICATION_FACTORS
        ):
            ratio = self._calculate_ratio(factor, data.target)
            # Skip variants that would leave too few minority samples.
            if ratio[1] >= 15:
                undersampled_datasets.append(
                    (f"{name} ({factor})", self._make_imbalance(data, factor))
                )
        # Extend only after the loop so the new entries are not themselves
        # undersampled again.
        self.content_ += undersampled_datasets
        return self

    def fetch_breast_tissue(self):
        """Download and transform the Breast Tissue Data Set.

        The minority class is identified as the `car` and `fad`
        labels and the majority class as the rest of the labels.

        http://archive.ics.uci.edu/ml/datasets/breast+tissue
        """
        data = pd.read_excel(FETCH_URLS["breast_tissue"], sheet_name="Data")
        data = data.drop(columns="Case #").rename(columns={"Class": "target"})
        data["target"] = data["target"].isin(["car", "fad"]).astype(int)
        return data

    def fetch_ecoli(self):
        """Download and transform the Ecoli Data Set.

        The minority class is identified as the `pp` label
        and the majority class as the rest of the labels.

        https://archive.ics.uci.edu/ml/datasets/ecoli
        """
        # ``sep=r"\s+"`` is the documented replacement for the deprecated
        # ``delim_whitespace=True``.
        data = pd.read_csv(FETCH_URLS["ecoli"], header=None, sep=r"\s+")
        data = data.drop(columns=0).rename(columns={8: "target"})
        data["target"] = data["target"].isin(["pp"]).astype(int)
        return data

    def fetch_eucalyptus(self):
        """Download and transform the Eucalyptus Data Set.

        The minority class is identified as the `best` label
        and the majority class as the rest of the labels.

        https://www.openml.org/d/188
        """
        data = pd.read_csv(FETCH_URLS["eucalyptus"])
        # Keep the last nine columns only; ``Utility`` holds the labels.
        data = data.iloc[:, -9:].rename(columns={"Utility": "target"})
        # Cells equal to "?" are masked to NaN and their rows removed.
        data = data[data != "?"].dropna()
        data["target"] = data["target"].isin(["best"]).astype(int)
        return data

    def fetch_glass(self):
        """Download and transform the Glass Identification Data Set.

        The minority class is identified as the `1` label
        and the majority class as the rest of the labels.

        https://archive.ics.uci.edu/ml/datasets/glass+identification
        """
        data = pd.read_csv(FETCH_URLS["glass"], header=None)
        data = data.drop(columns=0).rename(columns={10: "target"})
        data["target"] = data["target"].isin([1]).astype(int)
        return data

    def fetch_haberman(self):
        """Download and transform the Haberman's Survival Data Set.

        The minority class is identified as the `2` label
        and the majority class as the rest of the labels.

        https://archive.ics.uci.edu/ml/datasets/Haberman's+Survival
        """
        data = pd.read_csv(FETCH_URLS["haberman"], header=None)
        data.rename(columns={3: "target"}, inplace=True)
        data["target"] = data["target"].isin([2]).astype(int)
        return data

    def fetch_heart(self):
        """Download and transform the Heart Data Set.

        The minority class is identified as the `2` label
        and the majority class as the `1` label.

        http://archive.ics.uci.edu/ml/datasets/statlog+(heart)
        """
        data = pd.read_csv(FETCH_URLS["heart"], header=None, sep=r"\s+")
        data.rename(columns={13: "target"}, inplace=True)
        data["target"] = data["target"].isin([2]).astype(int)
        return data

    def fetch_iris(self):
        """Download and transform the Iris Data Set.

        The minority class is identified as the `Iris-setosa` label
        and the majority class as the rest of the labels.

        https://archive.ics.uci.edu/ml/datasets/iris
        """
        data = pd.read_csv(FETCH_URLS["iris"], header=None)
        data.rename(columns={4: "target"}, inplace=True)
        data["target"] = data["target"].isin(["Iris-setosa"]).astype(int)
        return data

    def fetch_libras(self):
        """Download and transform the Libras Movement Data Set.

        The minority class is identified as the `1` label
        and the majority class as the rest of the labels.

        https://archive.ics.uci.edu/ml/datasets/Libras+Movement
        """
        data = pd.read_csv(FETCH_URLS["libras"], header=None)
        data.rename(columns={90: "target"}, inplace=True)
        data["target"] = data["target"].isin([1]).astype(int)
        return data

    def fetch_liver(self):
        """Download and transform the Liver Disorders Data Set.

        The minority class is identified as the `1` label
        and the majority class as the '2' label.

        https://archive.ics.uci.edu/ml/datasets/liver+disorders
        """
        data = pd.read_csv(FETCH_URLS["liver"], header=None)
        data.rename(columns={6: "target"}, inplace=True)
        data["target"] = data["target"].isin([1]).astype(int)
        return data

    def fetch_pima(self):
        """Download and transform the Pima Indians Diabetes Data Set.

        The minority class is identified as the `1` label
        and the majority class as the '0' label. The labels are used as
        found in the file; no re-encoding is applied here.

        https://www.kaggle.com/uciml/pima-indians-diabetes-database
        """
        # The first nine lines of the file are skipped before parsing.
        data = pd.read_csv(FETCH_URLS["pima"], header=None, skiprows=9)
        data.rename(columns={8: "target"}, inplace=True)
        return data

    def fetch_vehicle(self):
        """Download and transform the Vehicle Silhouettes Data Set.

        The data is read from nine files (``xaa.dat`` to ``xai.dat``) that
        are concatenated into a single frame with a fresh integer index.

        The minority class is identified as the `van` label
        and the majority class as the rest of the labels.

        https://archive.ics.uci.edu/ml/datasets/Statlog+(Vehicle+Silhouettes)
        """
        # The partial files are resolved relative to the URL with "Index"
        # removed.
        base_url = FETCH_URLS["vehicle"].replace("Index", "")
        data = []
        for letter in ascii_lowercase[0:9]:
            partial_data = pd.read_csv(
                urljoin(base_url, f"xa{letter}.dat"),
                header=None,
                sep=r"\s+",
            )
            partial_data = partial_data.rename(columns={18: "target"})
            partial_data["target"] = partial_data["target"].isin(["van"]).astype(int)
            data.append(partial_data)
        # ignore_index avoids repeating each partial file's 0..n-1 row labels.
        return pd.concat(data, ignore_index=True)

    def fetch_wine(self):
        """Download and transform the Wine Data Set.

        The minority class is identified as the `2` label
        and the majority class as the rest of the labels.

        https://archive.ics.uci.edu/ml/datasets/wine
        """
        data = pd.read_csv(FETCH_URLS["wine"], header=None)
        data.rename(columns={0: "target"}, inplace=True)
        data["target"] = data["target"].isin([2]).astype(int)
        return data

    def fetch_new_thyroid_1(self):
        """Download and transform the Thyroid Disease Data Set.

        The minority class is identified as the `positive`
        label and the majority class as the `negative` label.

        .. note:: The positive class was originally label 2.

        https://archive.ics.uci.edu/ml/datasets/Thyroid+Disease
        """
        data = pd.read_csv(
            FETCH_URLS["new_thyroid"],
            header=None,
        )
        data.rename(columns={0: "target"}, inplace=True)
        data["target"] = (data["target"] == 2).astype(int)
        return data

    def fetch_new_thyroid_2(self):
        """Download and transform the Thyroid Disease Data Set.

        The minority class is identified as the `positive`
        label and the majority class as the `negative` label.

        .. note:: The positive class was originally label 3.

        https://archive.ics.uci.edu/ml/datasets/Thyroid+Disease
        """
        data = pd.read_csv(
            FETCH_URLS["new_thyroid"],
            header=None,
        )
        data.rename(columns={0: "target"}, inplace=True)
        data["target"] = (data["target"] == 3).astype(int)
        return data

    def fetch_cleveland(self):
        """Download and transform the Heart Disease Cleveland Data Set.

        The minority class is identified as the `positive` label and
        the majority class as the `negative` label.

        .. note:: The positive class was originally label 1. Rows with
           ``?`` entries are dropped.

        https://archive.ics.uci.edu/ml/datasets/heart+disease
        """
        data = pd.read_csv(FETCH_URLS["cleveland"], header=None, na_values="?")
        data.dropna(inplace=True)
        data.rename(columns={13: "target"}, inplace=True)
        data["target"] = (data["target"] == 1).astype(int)
        return data

    def fetch_dermatology(self):
        """Download and transform the Dermatology Data Set.

        The minority class is identified as the `positive` label and
        the majority class as the `negative` label.

        .. note:: The positive class was originally label 6. Column 33 is
           dropped.

        https://archive.ics.uci.edu/ml/datasets/Dermatology
        """
        data = pd.read_csv(FETCH_URLS["dermatology"], header=None)
        data.rename(columns={34: "target"}, inplace=True)
        data.drop(columns=33, inplace=True)
        data["target"] = (data.target == 6).astype(int)
        return data

    def fetch_led(self):
        """Download and transform the LED Display Domain Data Set.

        The minority class is identified as the `positive` label and
        the majority class as the `negative` label.

        .. note:: The positive class was originally label 1.

        https://www.openml.org/d/40496
        """
        data = pd.read_csv(FETCH_URLS["led"])
        data.rename(columns={"Class": "target"}, inplace=True)
        data["target"] = (data.target == 1).astype(int)
        return data

    def fetch_page_blocks(self):
        """Download and transform the Page Blocks Data Set.

        The minority class is identified as the `positive` label and
        the majority class as the `negative` label.

        .. note:: The positive class is every original label other than 1.

        https://www.openml.org/d/1021
        """
        data = pd.read_csv(FETCH_URLS["page_blocks"])
        data.rename(columns={"class": "target"}, inplace=True)
        data["target"] = (data.target != 1).astype(int)
        return data

    def fetch_vowel(self):
        """Download and transform the Vowel Recognition Data Set.

        The minority class is identified as the `positive` label and
        the majority class as the `negative` label.

        .. note:: The ``speaker`` column is used as the label and the
           positive class was originally label 1. The ``utterance`` and
           ``frame`` columns are dropped.

        https://www.openml.org/d/375
        """
        data = pd.read_csv(FETCH_URLS["vowels"])
        data.rename(columns={"speaker": "target"}, inplace=True)
        data.drop(columns=["utterance", "frame"], inplace=True)
        data["target"] = (data["target"] == 1).astype(int)
        return data

    def fetch_yeast(self):
        """Download and transform the Yeast Data Set.

        The minority class is identified as the `positive` label and
        the majority class as the `negative` label.

        .. note:: The positive class was originally label ``MIT``. The
           feature values are kept as the string tokens read from the file.

        https://archive.ics.uci.edu/ml/datasets/Yeast
        """
        data = pd.read_csv(FETCH_URLS["yeast"], header=None)
        # Each row is read as a single string; split it on spaces and
        # discard the empty tokens produced by repeated spaces.
        data = pd.DataFrame(
            [
                [val for val in row.split(" ") if len(val) != 0]
                for row in data[0].tolist()
            ]
        )
        data.drop(columns=0, inplace=True)
        data.rename(columns={9: "target"}, inplace=True)
        data["target"] = (data["target"] == "MIT").astype(int)
        return data
[docs]
class BinaryDatasets(Datasets):
    """Class to download, transform and save binary class datasets.

    Every ``fetch_*`` method returns a :class:`pandas.DataFrame` whose label
    column is named ``target``.
    """

    def fetch_banknote_authentication(self):
        """Download and transform the Banknote Authentication Data Set.

        Column 4 is used as the label, as found in the file.

        https://archive.ics.uci.edu/ml/datasets/banknote+authentication
        """
        data = pd.read_csv(FETCH_URLS["banknote_authentication"], header=None)
        data.rename(columns={4: "target"}, inplace=True)
        return data

    def fetch_arcene(self):
        """Download and transform the Arcene Data Set.

        The train and validation parts are read from the downloaded zip
        archive and stacked. Columns 1998 through 10000 are dropped, keeping
        columns 0-1997. Label ``1`` is encoded as ``1`` and every other
        label as ``0``.

        https://archive.ics.uci.edu/ml/datasets/Arcene

        Raises
        ------
        requests.HTTPError
            If the download responds with an error status.
        """
        response = requests.get(FETCH_URLS["arcene"])
        # Fail on an HTTP error instead of trying to unzip an error page.
        response.raise_for_status()
        data, labels = [], []
        with ZipFile(BytesIO(response.content)) as archive:
            for data_type in ("train", "valid"):
                features_text = archive.read(
                    f"ARCENE/arcene_{data_type}.data"
                ).decode("utf-8")
                data.append(
                    pd.read_csv(
                        StringIO(features_text),
                        header=None,
                        sep=" ",
                    ).drop(columns=list(range(1998, 10001)))
                )
                # Only the training labels live under the ARCENE/ folder; the
                # validation labels are read from the archive root.
                labels_prefix = "ARCENE/" if data_type == "train" else ""
                labels_text = archive.read(
                    f"{labels_prefix}arcene_{data_type}.labels"
                ).decode("utf-8")
                labels.append(
                    pd.read_csv(StringIO(labels_text), header=None).rename(
                        columns={0: "target"}
                    )
                )
        data = pd.concat(data, ignore_index=True)
        labels = pd.concat(labels, ignore_index=True)
        data = pd.concat([data, labels], axis=1)
        data["target"] = data["target"].isin([1]).astype(int)
        return data

    def fetch_audit(self):
        """Download and transform the Audit Data Set.

        Reads ``audit_data/audit_risk.csv`` from the downloaded zip archive,
        drops the ``LOCATION_ID`` column, uses ``Risk`` as the label and
        removes rows with missing values.

        https://archive.ics.uci.edu/ml/datasets/Audit+Data

        Raises
        ------
        requests.HTTPError
            If the download responds with an error status.
        """
        response = requests.get(FETCH_URLS["audit"])
        # Fail on an HTTP error instead of trying to unzip an error page.
        response.raise_for_status()
        with ZipFile(BytesIO(response.content)) as archive:
            unzipped_data = archive.read("audit_data/audit_risk.csv").decode("utf-8")
        # Strip everything from an "@" to the end of its line (including the
        # newline run that follows) before parsing.
        data = pd.read_csv(StringIO(sub(r"@.+\n+", "", unzipped_data)), engine="python")
        data = (
            data.drop(columns=["LOCATION_ID"])
            .rename(columns={"Risk": "target"})
            .dropna()
        )
        return data

    def fetch_spambase(self):
        """Download and transform the Spambase Data Set.

        Column 57 is used as the label, as found in the file.

        https://archive.ics.uci.edu/ml/datasets/Spambase
        """
        data = pd.read_csv(FETCH_URLS["spambase"], header=None)
        data.rename(columns={57: "target"}, inplace=True)
        return data

    def fetch_parkinsons(self):
        """Download and transform the Parkinsons Data Set.

        The ``name`` column is dropped and ``status`` becomes the last
        column, renamed to ``target``. Status ``0`` is encoded as ``1`` and
        every other status as ``0``.

        https://archive.ics.uci.edu/ml/datasets/parkinsons
        """
        data = pd.read_csv(FETCH_URLS["parkinsons"])
        # Re-assemble the frame so the label ends up as the last column.
        data = pd.concat(
            [
                data.drop(columns=["name", "status"]),
                data[["status"]].rename(columns={"status": "target"}),
            ],
            axis=1,
        )
        data["target"] = data["target"].isin([0]).astype(int)
        return data

    def fetch_ionosphere(self):
        """Download and transform the Ionosphere Data Set.

        Columns 0 and 1 are dropped. Label ``b`` is encoded as ``1`` and
        every other label as ``0``.

        https://archive.ics.uci.edu/ml/datasets/ionosphere
        """
        data = pd.read_csv(FETCH_URLS["ionosphere"], header=None)
        data = data.drop(columns=[0, 1]).rename(columns={34: "target"})
        data["target"] = data["target"].isin(["b"]).astype(int)
        return data

    def fetch_breast_cancer(self):
        """Download and transform the Breast Cancer Wisconsin Data Set.

        Column 0 is dropped and column 1 becomes the last column, renamed to
        ``target``. Label ``M`` is encoded as ``1`` and every other label as
        ``0``.

        https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)
        """
        data = pd.read_csv(FETCH_URLS["breast_cancer"], header=None)
        # Re-assemble the frame so the label ends up as the last column.
        data = pd.concat(
            [data.drop(columns=[0, 1]), data[[1]].rename(columns={1: "target"})], axis=1
        )
        data["target"] = data["target"].isin(["M"]).astype(int)
        return data