from itertools import product
from sklearn.base import clone
from sklearn.utils import check_random_state
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import ParameterGrid
from imblearn.utils import Substitution
from imblearn.pipeline import Pipeline
from imblearn.utils._docstring import _random_state_docstring
[docs]
@Substitution(
random_state=_random_state_docstring,
)
def check_random_states(random_state, n_runs):
"""
Create random states for experiments. Used to create seeds for different
initializations.
Parameters
----------
{random_state}
n_runs : int
Number of initializations.
Returns
-------
random_states : list
A list of random states with length ``n_runs``.
"""
random_state = check_random_state(random_state)
return [random_state.randint(0, 2**32 - 1, dtype="uint32") for _ in range(n_runs)]
[docs]
@Substitution(
random_state=_random_state_docstring,
)
def check_pipelines(*objects_list, random_state, n_runs):
"""
Extract estimators and parameter grids to be passed to ModelSearchCV. This enables
searching over any sequence of parameter settings and objects.
Parameters
----------
*objects_list : sequence of lists
Lists of objects to be chained in a pipeline in the passed order. Each list
must contain tuples composed of (``<obj_name>``, ``<object>``,
``<parameter_values_dict>``).
{random_state}
n_runs : int
Number of initializations.
Returns
-------
estimators : List of Pipelines with all combinations among the passed lists of
objects.
param_grids : List of dictionaries with estimator and parameter names (``str``) as
keys and lists of parameter settings to try as values.
"""
# Create random states
random_states = check_random_states(random_state, n_runs)
pipelines = []
param_grid = []
for comb, rs in product(product(*objects_list), random_states):
name = "|".join([i[0] for i in comb])
# name, object, sub grid
comb = [
(
(nm, ob, ParameterGrid(sg))
if ob is not None
else (nm, FunctionTransformer(), ParameterGrid(sg))
)
for nm, ob, sg in comb
]
# Create estimator
if name not in [n[0] for n in pipelines]:
est = Pipeline([(nm, ob) for nm, ob, _ in comb])
pipelines.append((name, est))
# Create intermediate parameter grids
sub_grids = [
[{f"{nm}__{k}": v for k, v in param_def.items()} for param_def in sg]
for nm, obj, sg in comb
]
# Create parameter grids
for sub_grid in product(*sub_grids):
param_prefix = f"{name}__"
grid = {"est_name": [name]}
grid.update(
{f"{param_prefix}{k}": [v] for d in sub_grid for k, v in d.items()}
)
random_states = {
f"{param_prefix}{param}": [rs]
for param in est.get_params()
if "random_state" in param
}
grid.update(random_states)
# Avoid multiple runs over pipelines without random state
if grid not in param_grid:
param_grid.append(grid)
return pipelines, param_grid
[docs]
def check_pipelines_wrapper(
*objects_list,
wrapper,
random_state,
n_runs,
estimator_param="classifier",
wrapped_only=True,
):
"""
Extract estimators within a wrapper object and parameter grids to be passed to
ModelSearchCV. This enables searching over any sequence of parameter settings and
objects.
Parameters
----------
*objects_list : sequence of lists
Lists of objects to be chained in a pipeline in the passed order. Each list
must contain tuples composed of (``<obj_name>``, ``<object>``,
``<parameter_values_dict>``).
wrapper : tuple or tuple
Wrapper object to which the lists of objects will be passed. Must be structured
as (``<obj_name>``, ``<object>``, ``<parameter_values_dict>``) and .
{random_state}
n_runs : int
Number of initializations.
estimator_param : str, default="classifier"
Name of the parameter in the wrapper object where the estimators will be passed.
wrapped_only : bool, default=True
Return only the wrapped estimators. If ``False``, returns both the wrapped and
the original objects.
Returns
-------
wrapped_estimators : List of Pipelines with all combinations among the passed lists
of objects.
wrapped_param_grids : List of dictionaries with estimator and parameter names
(``str``) as keys and lists of parameter settings to try as values.
"""
wrapper_label = wrapper[0]
wrapper_obj = wrapper[1]
wrapper_grid = wrapper[2]
estimators, param_grids = check_pipelines(
*objects_list, random_state=random_state, n_runs=n_runs
)
wrapped_estimators = [
(
f"{wrapper_label}|{name}",
clone(wrapper_obj).set_params(**{estimator_param: pipeline}),
)
for name, pipeline in estimators
]
def _format_param(param):
return "__".join(param.split("__")[1:])
wrapped_param_grids = [
{
"est_name": [f'{wrapper_label}|{d["est_name"][0]}'],
**{
f'{wrapper_label}|{d["est_name"][0]}__{estimator_param}__'
+ f"{_format_param(k)}": v
for k, v in d.items()
if k != "est_name"
},
**{
f'{wrapper_label}|{d["est_name"][0]}__{k}': v
for k, v in wrapper_grid.items()
},
}
for d in param_grids
]
if wrapped_only:
return wrapped_estimators, wrapped_param_grids
else:
return (estimators + wrapped_estimators, param_grids + wrapped_param_grids)