Source code for research.utils._data
"""
Data I/O utils. Later on I might add other data handling utilities.
"""
from os import listdir
from os.path import isdir, join
import pandas as pd
from sqlite3 import connect
def load_datasets(data_dir, suffix='', target_exists=True, **read_csv_kwargs):
    """Load datasets from sqlite database and/or csv files.

    Every entry of ``data_dir`` whose file name ends with ``suffix`` is
    inspected. A ``.csv`` file yields one dataset; a ``.db`` file is opened
    as a SQLite database and yields one dataset per table listed in
    ``sqlite_master``. Entries with any other extension are ignored.

    Parameters
    ----------
    data_dir : str
        Path of the directory that contains the data files.
    suffix : str, default=''
        Only file names ending with this string are loaded. The default
        matches every file name.
    target_exists : bool, default=True
        When true, each dataset is returned as a ``(X, y)`` tuple in which
        ``y`` is the last column and ``X`` holds all the other columns.
        When false, the complete data frame is returned.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pandas.read_csv``. They are
        not used for SQLite tables.

    Returns
    -------
    list of tuple
        ``(name, data)`` pairs, in the order the files are listed by
        ``os.listdir``. ``name`` is the file name without its ``.csv``
        extension (or the table name), with underscores replaced by spaces
        and converted to upper case.

    Raises
    ------
    AssertionError
        If ``data_dir`` is not an existing directory.
    """
    # Raised explicitly rather than through an ``assert`` statement so the
    # check still runs under ``python -O``; the exception type is unchanged.
    if not isdir(data_dir):
        raise AssertionError('`data_dir` must be a directory.')

    csv_ext = '.csv'

    # Filter data by suffix
    dat_names = [dat for dat in listdir(data_dir) if dat.endswith(suffix)]

    # Read data
    datasets = []
    for dat_name in dat_names:
        data_path = join(data_dir, dat_name)

        # Handle csv data
        if dat_name.endswith(csv_ext):
            ds = pd.read_csv(data_path, **read_csv_kwargs)
            # Strip only the trailing extension; ``str.replace`` would also
            # remove any '.csv' occurring in the middle of the file name.
            name = dat_name[:-len(csv_ext)].replace('_', ' ').upper()
            if target_exists:
                ds = (ds.iloc[:, :-1], ds.iloc[:, -1])
            datasets.append((name, ds))

        # Handle sqlite database
        elif dat_name.endswith('.db'):
            # A sqlite3 connection used as a context manager only commits or
            # rolls back the transaction; it has to be closed explicitly.
            connection = connect(data_path)
            try:
                datasets_names = [
                    row[0]
                    for row in connection.execute(
                        "SELECT name FROM sqlite_master WHERE type='table';"
                    )
                ]
                for dataset_name in datasets_names:
                    # Double any embedded quote so that the table name stays
                    # a single, valid quoted identifier.
                    quoted_name = dataset_name.replace('"', '""')
                    ds = pd.read_sql(
                        f'select * from "{quoted_name}"', connection
                    )
                    if target_exists:
                        ds = (ds.iloc[:, :-1], ds.iloc[:, -1])
                    datasets.append(
                        (dataset_name.replace('_', ' ').upper(), ds)
                    )
            finally:
                connection.close()

    return datasets