# Source code for research.utils._data
"""
Data I/O utils. Later on I might add other data handling utilities.
"""
from os import listdir
from os.path import isdir, join
import pandas as pd
from sqlite3 import connect
def _as_named_dataset(raw_name, frame, target_exists):
    """Build the ``(name, data)`` entry for one loaded table."""
    # Display name: underscores become spaces and everything is upper-cased.
    name = raw_name.replace("_", " ").upper()
    if target_exists:
        # The last column is the target; all preceding columns are features.
        return name, (frame.iloc[:, :-1], frame.iloc[:, -1])
    return name, frame


def load_datasets(data_dir, suffix="", target_exists=True, **read_csv_kwargs):
    """Load datasets from sqlite databases and/or csv files.

    Every entry of ``data_dir`` whose file name ends with ``suffix`` is
    inspected. ``.csv`` files yield one dataset each; ``.db`` files are
    opened as sqlite databases and yield one dataset per table. Entries with
    any other extension are ignored.

    Parameters
    ----------
    data_dir : str
        Directory that contains the data files. Only its direct entries are
        listed.
    suffix : str, default=""
        Only file names ending with this string are loaded. The default
        matches every file.
    target_exists : bool, default=True
        If True, each dataset is returned as a ``(X, y)`` tuple where ``y``
        is the last column and ``X`` holds the remaining columns. If False,
        the full ``DataFrame`` is returned.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pandas.read_csv`` (csv files
        only).

    Returns
    -------
    list of tuple
        ``(name, data)`` pairs. ``name`` is the csv file name (without the
        ``.csv`` extension) or the sqlite table name, with underscores
        replaced by spaces and converted to upper case.

    Raises
    ------
    AssertionError
        If ``data_dir`` is not an existing directory.
    """
    assert isdir(data_dir), "`data_dir` must be a directory."

    csv_ext = ".csv"
    datasets = []
    for dat_name in listdir(data_dir):
        # Filter data by suffix
        if not dat_name.endswith(suffix):
            continue
        data_path = join(data_dir, dat_name)

        # Handle csv data
        if dat_name.endswith(csv_ext):
            frame = pd.read_csv(data_path, **read_csv_kwargs)
            # Strip only the trailing extension; a ".csv" occurring elsewhere
            # in the file name is part of the dataset name.
            stem = dat_name[: -len(csv_ext)]
            datasets.append(_as_named_dataset(stem, frame, target_exists))

        # Handle sqlite database
        elif dat_name.endswith(".db"):
            # A sqlite3 connection used as a context manager only manages the
            # transaction, it does not close the connection, so close it
            # explicitly.
            connection = connect(data_path)
            try:
                table_names = [
                    row[0]
                    for row in connection.execute(
                        "SELECT name FROM sqlite_master WHERE type='table';"
                    )
                ]
                for table_name in table_names:
                    # Double any embedded quote so the name stays a valid
                    # double-quoted SQL identifier.
                    quoted = table_name.replace('"', '""')
                    frame = pd.read_sql(f'select * from "{quoted}"', connection)
                    datasets.append(
                        _as_named_dataset(table_name, frame, target_exists)
                    )
            finally:
                connection.close()
    return datasets