Source code for research.utils._utils

# Author: Georgios Douzas <gdouzas@icloud.com>
#         Joao Fonseca <jpmrfonseca@gmail.com>
# License: MIT

import numpy as np
import pandas as pd
from os.path import join, dirname, abspath


def generate_mean_std_tbl(mean_vals, std_vals):
    r"""Generate a table that combines mean and std/sem values.

    The first two columns of ``mean_vals`` are passed through unchanged.
    Every remaining column is rendered as ``"<mean> $\pm$ <std>"``, with both
    numbers formatted to two decimals and a thousands separator.

    Parameters
    ----------
    mean_vals : pandas.DataFrame
        Mean scores. Columns from position 2 onwards must be numeric.
    std_vals : pandas.DataFrame
        Dispersion values. Only the columns from position 2 onwards are read;
        they are combined with ``mean_vals`` through DataFrame addition, so
        their index and column labels should match.

    Returns
    -------
    pandas.DataFrame
        The two leading columns of ``mean_vals`` followed by the formatted
        score columns.
    """
    fmt = "{:,.2f}".format

    def _format_cells(frame):
        # Column-wise Series.map replaces DataFrame.applymap, which is
        # deprecated since pandas 2.1; the element-wise result is the same.
        return frame.apply(lambda col: col.map(fmt))

    index = mean_vals.iloc[:, :2]
    scores = (
        _format_cells(mean_vals.iloc[:, 2:])
        + r" $\pm$ "
        + _format_cells(std_vals.iloc[:, 2:])
    )
    return pd.concat([index, scores], axis=1)
def generate_pvalues_tbl(tbl):
    """Render every float column of ``tbl`` in scientific notation.

    Columns whose dtype equals ``float`` are overwritten in place with
    strings produced by the ``"%.1e"`` format; all other columns are left
    as they are. The same (mutated) table object is returned.
    """

    def _as_scientific(pvalue):
        return "%.1e" % pvalue

    # Boolean Series indexed by column name, True for float columns.
    is_float = tbl.dtypes == float
    for column in is_float[is_float].index:
        tbl[column] = tbl[column].apply(_as_scientific)
    return tbl
def sort_tbl(tbl, ds_order=None, ovrs_order=None, clfs_order=None, metrics_order=None):
    """
    Order the rows and columns of a results table. Mostly used to format
    results from oversampling experiments.

    Each of the "Dataset", "Oversampler", "Classifier" and "Metric" columns
    present in ``tbl`` is converted in place to a categorical using the
    matching ``*_order`` argument as categories (``None`` lets pandas infer
    them). The table is then sorted in place by those columns, taken in the
    order they appear in ``tbl``. When ``ovrs_order`` is given and all of its
    entries are column names, the result is restricted to the key columns
    followed by the ``ovrs_order`` columns.
    """
    columns = tbl.columns
    orders = {
        "Dataset": ds_order,
        "Oversampler": ovrs_order,
        "Classifier": clfs_order,
        "Metric": metrics_order,
    }

    for key, categories in orders.items():
        if key in columns:
            tbl[key] = pd.Categorical(tbl[key], categories=categories)

    # Sort keys follow the table's own column order, not the order above.
    sort_cols = [col for col in columns if col in orders]
    tbl.sort_values(sort_cols, inplace=True)

    if ovrs_order is None or not set(ovrs_order).issubset(columns):
        return tbl
    return tbl[sort_cols + list(ovrs_order)]
def generate_paths(filepath):
    """
    Build the data, results and analysis paths for ``filepath``.

    Each returned path is the absolute directory containing ``filepath``,
    followed by a literal ``..`` component and the folder name. The paths are
    not normalised. They are returned as a list in the order data, results,
    analysis.
    """
    containing_dir = dirname(abspath(filepath))
    root = join(containing_dir, "..")
    return [join(root, folder) for folder in ("data", "results", "analysis")]
def make_bold(row, maximum=True, num_decimals=2, threshold=None, with_sem=False):
    """
    Wrap the best value(s) of ``row`` in a latex textbf command.

    ``row`` is rounded to ``num_decimals`` and formatted as fixed-point
    strings. Without a ``threshold`` the entries equal to the maximum (or the
    minimum when ``maximum`` is False) are made bold; with a ``threshold``
    the entries strictly above (or strictly below) it are made bold.

    With ``with_sem`` the textbf command is left without its closing brace
    and the boolean mask of bold entries is returned next to the formatted
    row, as a ``(row, mask)`` tuple.
    """
    row = round(row, num_decimals)

    if threshold is not None:
        mask = (row > threshold) if maximum else (row < threshold)
    elif maximum:
        mask = row == row.max()
    else:
        mask = row == row.min()

    formatter = "{0:.%sf}" % num_decimals
    # The brace is left open when a sem value will be appended by the caller.
    template = "\\textbf{%s" if with_sem else "\\textbf{%s}"

    row = row.apply(formatter.format)
    row[mask] = [template % formatter.format(v) for v in row[mask].astype(float)]

    # The mask is only needed when building a table with sem values.
    return (row, mask) if with_sem else row
def generate_mean_std_tbl_bold(
    mean_vals, std_vals, maximum=True, decimals=2, threshold=None
):
    r"""
    Generate a table that combines mean and std/sem values, with the best
    mean of each row in bold.

    Every row of ``mean_vals`` is passed to ``make_bold`` with
    ``with_sem=True``, so all of its columns must be numeric. Bold cells come
    back as an unclosed ``\textbf{<mean>``; the closing brace is appended
    after the matching dispersion value, giving
    ``\textbf{<mean> $\pm$ <std>}``.

    Parameters
    ----------
    mean_vals : pandas.DataFrame
        Mean scores, one row per table row.
    std_vals : pandas.DataFrame
        Dispersion values; must have the same shape as ``mean_vals`` because
        the two are combined positionally.
    maximum : bool, default=True
        Forwarded to ``make_bold``: highlight the highest values when True,
        the lowest otherwise.
    decimals : int, default=2
        Number of decimals used for both the mean and the dispersion values.
    threshold : float or None, default=None
        Forwarded to ``make_bold``.

    Returns
    -------
    pandas.DataFrame
        Formatted ``"<mean> $\pm$ <std>"`` strings.
    """

    def _bold_row(row):
        return make_bold(row, maximum, decimals, threshold, with_sem=True)

    mean_bold = mean_vals.apply(lambda row: _bold_row(row)[0], axis=1)
    mask = mean_vals.apply(lambda row: _bold_row(row)[1], axis=1).values

    formatter = "{0:.%sf}" % decimals
    # Column-wise Series.map replaces DataFrame.applymap, which is deprecated
    # since pandas 2.1; the element-wise result is the same.
    std_bold = std_vals.apply(lambda col: col.map(formatter.format))
    # Close the \textbf{ left open by make_bold on the highlighted cells.
    std_bold = np.where(mask, std_bold + "}", std_bold)

    return mean_bold + r" $\pm$ " + std_bold