Source code for mlresearch.utils._data

"""
Data I/O utils. Later on I might add other data handling utilities.
"""

from os import listdir
from os.path import isdir, join
from sqlite3 import connect

import pandas as pd


def load_datasets(
    data_dir, prefix="", suffix="", target_exists=True, **read_csv_kwargs
):
    """
    Load all datasets in a directory from sqlite databases and/or csv files.

    Parameters
    ----------
    data_dir : str
        Data directory to be crawled.

    prefix : str, default=''
        Load dataset if the file starts with the specified prefix.

    suffix : str, default=''
        Load dataset if the file ends with the specified suffix.

    target_exists : bool, default=True
        Specify whether there is a target feature. If True, it is assumed
        to be in the last position of the dataset.

    Returns
    -------
    datasets : list
        A list of nested tuples with structure (dataset_name, (X, y)).
    """
    assert isdir(data_dir), "`data_dir` must be a directory."

    # Filter files by prefix and suffix
    dat_names = [
        dat
        for dat in listdir(data_dir)
        if (dat.startswith(prefix) and dat.endswith(suffix))
    ]

    # Read data
    datasets = []
    for dat_name in dat_names:
        data_path = join(data_dir, dat_name)

        # Handle csv data
        if dat_name.endswith(".csv"):
            ds = pd.read_csv(data_path, **read_csv_kwargs)
            name = dat_name.replace(".csv", "").replace("_", " ").upper()
            if target_exists:
                ds = (ds.iloc[:, :-1], ds.iloc[:, -1])
            datasets.append((name, ds))

        # Handle sqlite database
        elif dat_name.endswith(".db"):
            with connect(data_path) as connection:
                datasets_names = [
                    name[0]
                    for name in connection.execute(
                        "SELECT name FROM sqlite_master WHERE type='table';"
                    )
                ]
                for dataset_name in datasets_names:
                    ds = pd.read_sql(f'select * from "{dataset_name}"', connection)
                    if target_exists:
                        ds = (ds.iloc[:, :-1], ds.iloc[:, -1])
                    datasets.append((dataset_name.replace("_", " ").upper(), ds))

    return datasets
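
# Minimal usage sketch. The directory name "datasets/" and the file names
# mentioned in the comments are hypothetical, chosen only for illustration;
# they are not part of the library.

if __name__ == "__main__":
    # Assumes "datasets/" exists and contains files such as "iris.csv"
    # and/or "benchmarks.db"; only csv files are picked up here because
    # of the suffix filter.
    datasets = load_datasets("datasets/", suffix=".csv")

    # Each entry is (dataset_name, (X, y)) when target_exists=True.
    for name, (X, y) in datasets:
        print(name, X.shape, y.shape)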