Source code for mlresearch.datasets._multiclass

"""
Download, transform and simulate various datasets.
"""

# Author: Joao Fonseca <jpfonseca@novaims.unl.pt>
# License: MIT

import os
from os.path import join
from urllib.parse import urljoin
from string import ascii_lowercase

from io import BytesIO, StringIO
from zipfile import ZipFile
import tarfile
import requests

import numpy as np
import pandas as pd

from .base import Datasets, get_data_home, FETCH_URLS
from ..utils._utils import _optional_import


class ContinuousCategoricalDatasets(Datasets):
    """Class to download, transform and save datasets with both continuous
    and categorical features."""

    @staticmethod
    def _modify_columns(data, categorical_features):
        """Rename and reorder columns of dataframe."""
        X_metric, X_cat, y = (
            data.drop(columns=categorical_features + ["target"]),
            data[categorical_features],
            data.target,
        )
        X_metric.columns = range(len(X_metric.columns))
        X_cat.columns = [f"cat_{i}" for i in range(len(X_cat.columns))]
        return pd.concat([X_metric, X_cat, y], axis=1)
    def download(self):
        """Download the datasets."""
        self.data_home_ = get_data_home(data_home=self.data_home)
        dataset_prefix = self.__class__.__name__.lower().replace("datasets", "")

        # Get datasets to download
        if self.names == "all":
            func_names = [
                func_name for func_name in dir(self) if "fetch_" in func_name
            ]
        else:
            func_names = [
                f"fetch_{name}".lower().replace(" ", "_") for name in self.names
            ]

        # Download datasets
        try:
            tqdm = _optional_import("tqdm.auto").tqdm
            iterable = tqdm(func_names, desc="Datasets")
        except ImportError:
            iterable = func_names

        self.content_ = []
        for func_name in iterable:
            dat_name = func_name.replace("fetch_", "")
            name = dat_name.upper().replace("_", " ")
            file_name = f"{dataset_prefix}_{dat_name}.csv"
            if (
                file_name not in os.listdir(self.data_home_)
                and self.download_if_missing
            ):
                df, categorical_features = getattr(self, func_name)()
                df = self._modify_columns(df, list(categorical_features))
                df.to_csv(join(self.data_home_, file_name), index=False)
            data = pd.read_csv(join(self.data_home_, file_name))
            self.content_.append((name, data))
        return self
    def summarize_datasets(self):
        """
        Create a summary of the downloaded datasets.

        Returns
        -------
        datasets_summary : pd.DataFrame
            Dataframe with summary statistics of all datasets.
        """
        datasets_summary = super(
            ContinuousCategoricalDatasets, self
        ).summarize_datasets()
        columns = datasets_summary.columns.tolist()

        # Define summary table columns and empty list
        summary_columns = ["Metric", "Non-Metric"]
        extended_summary = []

        # Populate empty list
        for name, dataset in self.content_:
            dataset = dataset.drop(columns="target")
            values = [
                name,
                sum(~dataset.columns.str.startswith("cat_")),
                sum(dataset.columns.str.startswith("cat_")),
            ]
            extended_summary.append(values)

        extended_summary = pd.DataFrame(
            extended_summary, columns=["Dataset name"] + summary_columns
        ).set_index("Dataset name")
        datasets_summary = pd.concat([datasets_summary, extended_summary], axis=1)

        # Reorder columns
        index = columns.index("Features") + 1
        columns = [*columns[:index], *summary_columns, *columns[index:]]
        return datasets_summary[columns]
    def fetch_adult(self):
        """Download and transform the Adult Data Set.

        https://archive.ics.uci.edu/ml/datasets/Adult
        """
        data = pd.read_csv(FETCH_URLS["adult"], header=None, na_values=" ?").dropna()
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        mapper = {v: k for k, v in enumerate(data.target.unique())}
        data.target = data.target.map(mapper)
        categorical_features = [1, 3, 5, 6, 7, 8, 9, 13]

        # Trim spaces in categorical features
        data.iloc[:, categorical_features] = data.iloc[:, categorical_features].map(
            lambda x: x.strip()
        )
        return data, categorical_features
    def fetch_abalone(self):
        """Download and transform the Abalone Data Set.

        https://archive.ics.uci.edu/ml/datasets/Abalone
        """
        data = pd.read_csv(FETCH_URLS["abalone"], header=None)
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        categorical_features = [0]
        return data, categorical_features
    def fetch_acute(self):
        """Download and transform the Acute Inflammations Data Set.

        https://archive.ics.uci.edu/ml/datasets/Acute+Inflammations
        """
        data = pd.read_csv(
            FETCH_URLS["acute"], header=None, sep="\t", decimal=",", encoding="UTF-16"
        )
        data["target"] = data[6].str[0] + data[7].str[0]
        data.drop(columns=[6, 7], inplace=True)
        mapper = {v: k for k, v in enumerate(data.target.unique())}
        data.target = data.target.map(mapper)
        categorical_features = list(range(1, 6))
        return data, categorical_features
    def fetch_annealing(self):
        """Download and transform the Annealing Data Set.

        https://archive.ics.uci.edu/ml/datasets/Annealing
        """
        data = pd.read_csv(FETCH_URLS["annealing"], header=None, na_values="?")

        # some features are dropped; they have too many missing values
        missing_feats = (data.isnull().sum(0) / data.shape[0]) < 0.1
        data = data.iloc[:, missing_feats.values]
        data[2].fillna(data[2].mode().squeeze(), inplace=True)
        data = data.T.reset_index(drop=True).T
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        mapper = {v: k for k, v in enumerate(data.target.unique())}
        data.target = data.target.map(mapper)
        categorical_features = [0, 1, 5, 9]
        return data, categorical_features
    def fetch_census(self):
        """Download and transform the Census-Income (KDD) Data Set.

        https://archive.ics.uci.edu/dataset/117/census+income+kdd
        """
        zipped_data = requests.get(FETCH_URLS["census"]).content
        zipped_data = ZipFile(BytesIO(zipped_data)).read("census.tar.gz")
        zipped_data = tarfile.open(fileobj=BytesIO(zipped_data))
        data = pd.read_csv(
            StringIO(
                zipped_data.extractfile("census-income.data").read().decode("utf-8")
            ),
            header=None,
        )
        categorical_features = (
            list(range(1, 5))
            + list(range(6, 16))
            + list(range(19, 29))
            + list(range(30, 38))
            + [39]
        )

        # some features are dropped; they have too many missing values
        cols_ids = [1, 6, 9, 13, 14, 20, 21, 29, 31, 37]
        categorical_features = np.argwhere(
            np.delete(
                data.rename(columns={k: f"nom_{k}" for k in categorical_features})
                .columns.astype("str")
                .str.startswith("nom_"),
                cols_ids,
            )
        ).squeeze()
        data = data.drop(columns=cols_ids).T.reset_index(drop=True).T

        # some rows are dropped; they have rare missing values
        data = data.iloc[data.map(lambda x: x != " Not in universe").all(1).values, :]
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        mapper = {v: k for k, v in enumerate(data.target.unique())}
        data.target = data.target.map(mapper)
        return data, categorical_features
    def fetch_contraceptive(self):
        """Download and transform the Contraceptive Method Choice Data Set.

        https://archive.ics.uci.edu/ml/datasets/Contraceptive+Method+Choice
        """
        data = pd.read_csv(FETCH_URLS["contraceptive"], header=None)
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        categorical_features = [4, 5, 6, 8]
        return data, categorical_features
    def fetch_covertype(self):
        """Download and transform the Covertype Data Set.

        https://archive.ics.uci.edu/ml/datasets/Covertype
        """
        data = pd.read_csv(FETCH_URLS["covertype"], header=None)
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        wilderness_area = pd.Series(
            np.argmax(data.iloc[:, 10:14].values, axis=1), name=10
        )
        soil_type = pd.Series(np.argmax(data.iloc[:, 14:54].values, axis=1), name=11)
        data = (
            data.drop(columns=list(range(10, 54)))
            .join(wilderness_area)
            .join(soil_type)[list(range(0, 12)) + ["target"]]
        )
        categorical_features = [10, 11]
        return data, categorical_features
    def fetch_credit_approval(self):
        """Download and transform the Credit Approval Data Set.

        https://archive.ics.uci.edu/ml/datasets/Credit+Approval
        """
        data = pd.read_csv(
            FETCH_URLS["credit_approval"], header=None, na_values="?"
        ).dropna()
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        mapper = {v: k for k, v in enumerate(data.target.unique())}
        data.target = data.target.map(mapper)
        categorical_features = [0, 3, 4, 5, 6, 8, 9, 11, 12]
        return data, categorical_features
    def fetch_dermatology(self):
        """Download and transform the Dermatology Data Set.

        https://archive.ics.uci.edu/ml/datasets/Dermatology
        """
        data = pd.read_csv(
            FETCH_URLS["dermatology"], header=None, na_values="?"
        ).dropna()
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        categorical_features = list(range(data.shape[1] - 1))
        categorical_features.remove(33)
        return data, categorical_features
    def fetch_echocardiogram(self):
        """Download and transform the Echocardiogram Data Set.

        https://archive.ics.uci.edu/ml/datasets/Echocardiogram
        """
        data = pd.read_csv(
            FETCH_URLS["echocardiogram"],
            header=None,
            on_bad_lines="skip",
            na_values="?",
        )
        data.drop(columns=[10, 11], inplace=True)
        data.dropna(inplace=True)
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        categorical_features = [1, 3]
        return data, categorical_features
    def fetch_flags(self):
        """Download and transform the Flags Data Set.

        https://archive.ics.uci.edu/ml/datasets/Flags
        """
        data = pd.read_csv(FETCH_URLS["flags"], header=None)
        target = data[6].rename("target")
        data = data.drop(columns=[0, 6]).T.reset_index(drop=True).T.join(target)
        categorical_features = [
            0, 1, 4, 8, 9, 10, 11, 12, 13,
            14, 15, 21, 22, 23, 24, 25, 26, 27,
        ]
        return data, categorical_features
    def fetch_heart_disease(self):
        """Download and transform the Heart Disease Data Set.

        https://archive.ics.uci.edu/ml/datasets/Heart+Disease
        """
        data = (
            pd.concat(
                [
                    pd.read_csv(url, header=None, na_values="?")
                    for url in FETCH_URLS["heart_disease"]
                ],
                ignore_index=True,
            )
            .drop(columns=[10, 11, 12])
            .dropna()
        )
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        categorical_features = [1, 2, 5, 6, 8]
        return data, categorical_features
    def fetch_hepatitis(self):
        """Download and transform the Hepatitis Data Set.

        https://archive.ics.uci.edu/ml/datasets/Hepatitis
        """
        data = (
            pd.read_csv(FETCH_URLS["hepatitis"], header=None, na_values="?")
            .drop(columns=[15, 18])
            .dropna()
        )
        target = data[0].rename("target")
        data = data.drop(columns=[0]).T.reset_index(drop=True).T.join(target)
        categorical_features = list(range(1, 13)) + [16]
        return data, categorical_features
    def fetch_german_credit(self):
        """Download and transform the German Credit Data Set.

        https://archive.ics.uci.edu/ml/datasets/Statlog+%28German+Credit+Data%29
        """
        data = pd.read_csv(FETCH_URLS["german_credit"], header=None, sep=" ")
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        categorical_features = (
            np.argwhere(data.iloc[0, :-1].apply(lambda x: str(x)[0] == "A").values)
            .squeeze()
            .tolist()
        )
        return data, categorical_features
    def fetch_heart(self):
        """Download and transform the Heart Data Set.

        http://archive.ics.uci.edu/ml/datasets/statlog+(heart)
        """
        data = pd.read_csv(FETCH_URLS["heart"], header=None, delim_whitespace=True)
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        categorical_features = [1, 2, 5, 6, 8, 10, 12]
        return data, categorical_features
    def fetch_thyroid(self):
        """Download and transform the Thyroid Disease Data Set.

        Label 0 corresponds to no disease found.
        Label 1 corresponds to one or multiple diseases found.

        https://archive.ics.uci.edu/ml/datasets/Thyroid+Disease
        """
        data = (
            pd.read_csv(FETCH_URLS["thyroid"], header=None, na_values="?")
            .drop(columns=27)
            .dropna()
            .T.reset_index(drop=True)
            .T
        )
        data.rename(columns={data.columns[-1]: "target"}, inplace=True)
        data["target"] = (
            data["target"].apply(lambda x: x.split("[")[0]) != "-"
        ).astype(int)
        categorical_features = [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
            12, 13, 14, 15, 16, 18, 20, 22, 24, 26, 27,
        ]
        return data, categorical_features
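
# Usage sketch (an illustrative addition, not part of the original module).
# It assumes the ``Datasets`` base class accepts ``names``, ``data_home`` and
# ``download_if_missing`` constructor arguments, as the ``download`` method
# above implies; the helper name below is hypothetical.
def _example_continuous_categorical_usage():
    # Download two datasets, caching each as a CSV under the data home.
    datasets = ContinuousCategoricalDatasets(names=["adult", "abalone"])
    datasets.download()
    # ``content_`` holds ``(name, dataframe)`` pairs; the summary adds
    # "Metric" and "Non-Metric" feature counts after the "Features" column.
    return datasets.summarize_datasets()
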
class MultiClassDatasets(Datasets):
    """Class to download, transform and save multi-class datasets."""
    def fetch_first_order_theorem(self):
        """Download and transform the First Order Theorem Data Set.

        https://www.openml.org/d/1475
        """
        data = pd.read_csv(FETCH_URLS["first_order_theorem"])
        data.rename(columns={"Class": "target"}, inplace=True)
        return data
    def fetch_gas_drift(self):
        """Download and transform the Gas Drift Data Set.

        https://www.openml.org/d/1476
        """
        data = pd.read_csv(FETCH_URLS["gas_drift"])
        data.rename(columns={"Class": "target"}, inplace=True)
        return data
    def fetch_autouniv_au7(self):
        """Download and transform the AutoUniv au7 Data Set.

        https://www.openml.org/d/1552
        """
        data = pd.read_csv(FETCH_URLS["autouniv_au7"])
        data.rename(columns={"Class": "target"}, inplace=True)
        data.target = data.target.apply(lambda x: x.replace("class", "")).astype(int)
        mask = (data.iloc[:, :-1].nunique() > 10).tolist()
        mask.append(True)
        data = data.loc[:, mask].copy()
        return data
    def fetch_autouniv_au4(self):
        """Download and transform the AutoUniv au4 Data Set.

        https://www.openml.org/d/1548
        """
        data = pd.read_csv(FETCH_URLS["autouniv_au4"])
        data.rename(columns={"Class": "target"}, inplace=True)
        data.target = data.target.apply(lambda x: x.replace("class", "")).astype(int)
        mask = (data.iloc[:, :-1].nunique() > 10).tolist()
        mask.append(True)
        data = data.loc[:, mask].copy()
        return data
    def fetch_mice_protein(self):
        """Download and transform the Mice Protein Data Set.

        https://www.openml.org/d/40966
        """
        data = pd.read_csv(FETCH_URLS["mice_protein"])
        data.rename(columns={"class": "target"}, inplace=True)
        data.drop(columns=["MouseID"], inplace=True)
        data.replace("?", np.nan, inplace=True)
        mask = (data.iloc[:, :-1].nunique() > 10).tolist()
        mask.append(True)
        mask2 = data.isna().sum() < 10
        data = data.loc[:, mask & mask2].dropna().copy()
        data.iloc[:, :-1] = data.iloc[:, :-1].astype(float)
        mapper = {v: k for k, v in enumerate(data.target.unique())}
        data.target = data.target.map(mapper)
        return data
    def fetch_steel_plates(self):
        """Download and transform the Steel Plates Fault Data Set.

        https://www.openml.org/d/40982
        """
        data = pd.read_csv(FETCH_URLS["steel_plates"])
        mask = (data.iloc[:, :-1].nunique() > 10).tolist()
        mask.append(True)
        data = data.loc[:, mask].copy()
        mapper = {v: k for k, v in enumerate(data.target.unique())}
        data.target = data.target.map(mapper)
        return data
    def fetch_cardiotocography(self):
        """Download and transform the Cardiotocography Data Set.

        https://www.openml.org/d/1560
        """
        data = pd.read_csv(FETCH_URLS["cardiotocography"])
        data.rename(columns={"Class": "target"}, inplace=True)
        mask = (data.iloc[:, :-1].nunique() > 10).tolist()
        mask.append(True)
        data = data.loc[:, mask].copy()
        return data
    def fetch_waveform(self):
        """Download and transform the Waveform Database Generator (version 2)
        Data Set.

        https://www.openml.org/d/60
        """
        data = pd.read_csv(FETCH_URLS["waveform"])
        data.rename(columns={"class": "target"}, inplace=True)
        return data
    def fetch_volkert(self):
        """Download and transform the Volkert Data Set.

        https://www.openml.org/d/41166
        """
        data = pd.read_csv(FETCH_URLS["volkert"])
        data.rename(columns={"class": "target"}, inplace=True)
        mask = (data.iloc[:, 1:].nunique() > 100).tolist()
        mask.insert(0, True)
        data = data.loc[:, mask].copy()
        return data
    def fetch_vehicle(self):
        """Download and transform the Vehicle Silhouettes Data Set.

        https://archive.ics.uci.edu/ml/datasets/Statlog+(Vehicle+Silhouettes)
        """
        data = []
        for letter in ascii_lowercase[0:9]:
            partial_data = pd.read_csv(
                urljoin(
                    FETCH_URLS["vehicle"].replace("Index", ""), "xa%s.dat" % letter
                ),
                header=None,
                delim_whitespace=True,
            )
            partial_data = partial_data.rename(columns={18: "target"})
            data.append(partial_data)
        data = pd.concat(data)
        mapper = {v: k for k, v in enumerate(data.target.unique())}
        data.target = data.target.map(mapper)
        return data
    def fetch_asp_potassco(self):
        """Download and transform the ASP-POTASSCO Data Set.

        https://www.openml.org/d/41705
        """
        data = pd.read_csv(FETCH_URLS["asp_potassco"], na_values="?")
        data.dropna(inplace=True)
        data["target"] = data["algorithm"]
        data.drop(columns=["instance_id", "algorithm"], inplace=True)
        mask = (data.iloc[:, :-1].nunique() > 100).tolist()
        mask.append(True)
        data = data.loc[:, mask].copy()
        mapper = {v: k for k, v in enumerate(data.target.unique())}
        data.target = data.target.map(mapper)
        return data
    def fetch_wine_quality(self):
        """Download and transform the Wine Quality Data Set.

        https://www.openml.org/d/40691
        """
        data = pd.read_csv(FETCH_URLS["wine_quality"])
        data.rename(columns={"class": "target"}, inplace=True)
        return data
    def fetch_mfeat_zernike(self):
        """Download and transform the Multiple Features Dataset: Zernike
        Data Set.

        https://www.openml.org/d/22
        """
        data = pd.read_csv(FETCH_URLS["mfeat_zernike"])
        data.drop_duplicates(inplace=True)
        data.rename(columns={"class": "target"}, inplace=True)
        return data
    def fetch_gesture_segmentation(self):
        """Download and transform the Gesture Phase Segmentation Data Set.

        https://www.openml.org/d/4538
        """
        data = pd.read_csv(FETCH_URLS["gesture_segmentation"])
        data.rename(columns={"Phase": "target"}, inplace=True)
        mapper = {v: k for k, v in enumerate(data.target.unique())}
        data.target = data.target.map(mapper)
        return data
    def fetch_texture(self):
        """Download and transform the Texture Data Set.

        https://www.openml.org/d/40499
        """
        data = pd.read_csv(FETCH_URLS["texture"])
        data.drop_duplicates(inplace=True)
        data.rename(columns={"Class": "target"}, inplace=True)
        return data
    def fetch_usps(self):
        """Download and transform the USPS Data Set.

        https://www.openml.org/data/get_csv/19329737/usps.arff
        """
        data = pd.read_csv(FETCH_URLS["usps"])
        data.rename(columns={"int0": "target"}, inplace=True)
        return data
    def fetch_vowels(self):
        """Download and transform the Vowels Data Set.

        https://www.openml.org/d/375
        """
        data = pd.read_csv(FETCH_URLS["vowels"])
        data.rename(columns={"speaker": "target"}, inplace=True)
        data.drop(columns=["utterance", "frame"], inplace=True)
        return data
    def fetch_pendigits(self):
        """Download and transform the Pen-Based Recognition of Handwritten
        Digits Data Set.

        https://www.openml.org/d/32
        """
        data = pd.read_csv(FETCH_URLS["pendigits"])
        data.rename(columns={"class": "target"}, inplace=True)
        return data
    def fetch_image_segmentation(self):
        """Download and transform the Image Segmentation Data Set.

        https://www.openml.org/d/40984
        """
        data = pd.read_csv(FETCH_URLS["image_segmentation"])
        data.drop(columns=data.columns[:5], inplace=True)
        data.rename(columns={"class": "target"}, inplace=True)
        mapper = {v: k for k, v in enumerate(data.target.unique())}
        data.target = data.target.map(mapper)
        return data
    def fetch_baseball(self):
        """Download and transform the Baseball Hall of Fame Data Set.

        https://www.openml.org/d/185
        """
        data = pd.read_csv(FETCH_URLS["baseball"], na_values="?")
        data.drop(columns=["Player", "Position"], inplace=True)
        data.rename(columns={"Hall_of_Fame": "target"}, inplace=True)
        data.dropna(inplace=True)
        return data
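
# Usage sketch (an illustrative addition, not part of the original module),
# assuming the base ``Datasets`` class provides an analogous ``download`` that
# resolves each name to its ``fetch_*`` method by lower-casing it and
# replacing spaces with underscores, as the override above does.
if __name__ == "__main__":
    multiclass = MultiClassDatasets(names=["gas drift", "waveform"])
    multiclass.download()
    for name, df in multiclass.content_:
        # Each entry pairs the upper-cased dataset name with its dataframe.
        print(name, df.shape)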