Source code for research.utils._data

"""
Data I/O utilities. Other data-handling utilities may be added later.
"""
from os import listdir
from os.path import isdir, join
import pandas as pd
from sqlite3 import connect


def _split_target(frame, target_exists):
    """Split off the last column as the target when ``target_exists`` is true.

    Returns a ``(features, target)`` tuple in that case, otherwise returns
    ``frame`` unchanged.
    """
    if target_exists:
        return frame.iloc[:, :-1], frame.iloc[:, -1]
    return frame


def load_datasets(data_dir, suffix="", target_exists=True, **read_csv_kwargs):
    """Load datasets from sqlite databases and/or csv files in a directory.

    Parameters
    ----------
    data_dir : str
        Directory that is scanned (non-recursively) for data files.
    suffix : str or tuple of str, default=""
        Only entries whose name ends with this suffix are considered.
    target_exists : bool, default=True
        When true, every loaded table is split into a ``(features, target)``
        tuple, where the target is the last column. When false, the whole
        ``DataFrame`` is returned.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns
    -------
    list of (str, object)
        One ``(name, data)`` pair per csv file and per table of every
        ``.db`` file. ``name`` is the file stem (csv) or table name (sqlite)
        with underscores replaced by spaces and upper-cased. Entries are
        processed in sorted file-name order; files that are neither ``.csv``
        nor ``.db`` are ignored.

    Raises
    ------
    AssertionError
        If ``data_dir`` is not an existing directory.
    """
    # Raised explicitly (instead of an ``assert`` statement) so the check is
    # not stripped under ``python -O``; the exception type is kept so that
    # existing callers catching ``AssertionError`` keep working.
    if not isdir(data_dir):
        raise AssertionError("`data_dir` must be a directory.")

    csv_ext = ".csv"
    datasets = []

    # ``listdir`` returns entries in arbitrary order; sort for a
    # reproducible result across platforms and filesystems.
    dat_names = sorted(dat for dat in listdir(data_dir) if dat.endswith(suffix))

    for dat_name in dat_names:
        data_path = join(data_dir, dat_name)

        # Handle csv data
        if dat_name.endswith(csv_ext):
            ds = pd.read_csv(data_path, **read_csv_kwargs)
            # Strip only the trailing extension, not every ".csv" occurrence.
            name = dat_name[: -len(csv_ext)].replace("_", " ").upper()
            datasets.append((name, _split_target(ds, target_exists)))

        # Handle sqlite database
        elif dat_name.endswith(".db"):
            # A sqlite3 connection used as a context manager only manages the
            # transaction; it does not close the connection, so close it
            # explicitly to avoid leaking the file handle.
            connection = connect(data_path)
            try:
                table_names = [
                    row[0]
                    for row in connection.execute(
                        "SELECT name FROM sqlite_master WHERE type='table';"
                    )
                ]
                for table_name in table_names:
                    # Double any embedded quotes so the identifier stays valid.
                    quoted = table_name.replace('"', '""')
                    ds = pd.read_sql(f'select * from "{quoted}"', connection)
                    datasets.append(
                        (
                            table_name.replace("_", " ").upper(),
                            _split_target(ds, target_exists),
                        )
                    )
            finally:
                connection.close()

    return datasets