Source code for mlresearch.utils._data
"""
Data I/O utilities. Other data handling utilities may be added later.
"""
from os import listdir
from os.path import isdir, join
import pandas as pd
from sqlite3 import connect
def load_datasets(
data_dir, prefix="", suffix="", target_exists=True, **read_csv_kwargs
):
"""
Load all datasets in a directory from sqlite databases and/or csv files.
Parameters
----------
data_dir : str
Data directory to be crawled.
prefix : str, default=''
Load dataset if the file starts with the specified prefix.
suffix : str, default=''
        Load dataset if the file ends with the specified suffix.
target_exists : bool, default=True
        Specify whether there is a target feature. If True, it is assumed to be the
        last column of the dataset.
Returns
-------
datasets : list
        A list of tuples with structure (dataset_name, (X, y)). If target_exists
        is False, each tuple contains (dataset_name, dataset) instead.
"""
assert isdir(data_dir), "`data_dir` must be a directory."
# Filter data by suffix
dat_names = [
dat
for dat in listdir(data_dir)
if (dat.startswith(prefix) and dat.endswith(suffix))
]
# Read data
datasets = []
for dat_name in dat_names:
data_path = join(data_dir, dat_name)
# Handle csv data
if dat_name.endswith(".csv"):
ds = pd.read_csv(data_path, **read_csv_kwargs)
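            # Build a readable dataset name from the file name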
name = dat_name.replace(".csv", "").replace("_", " ").upper()
if target_exists:
ds = (ds.iloc[:, :-1], ds.iloc[:, -1])
datasets.append((name, ds))
# Handle sqlite database
elif dat_name.endswith(".db"):
with connect(data_path) as connection:
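                # List the tables stored in the sqlite database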
datasets_names = [
name[0]
for name in connection.execute(
"SELECT name FROM sqlite_master WHERE type='table';"
)
]
for dataset_name in datasets_names:
ds = pd.read_sql(f'select * from "{dataset_name}"', connection)
if target_exists:
ds = (ds.iloc[:, :-1], ds.iloc[:, -1])
datasets.append((dataset_name.replace("_", " ").upper(), ds))
return datasets
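
# Usage sketch (illustrative only, not part of the original module): the
# directory name and file suffix below are hypothetical. Assumes each csv
# file stores its target in the last column.
if __name__ == "__main__":
    for dataset_name, (X, y) in load_datasets("./data", suffix=".csv"):
        print(dataset_name, X.shape, y.shape)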