Source code for mlresearch.utils._check_pipelines

from itertools import product
from sklearn.base import clone
from sklearn.utils import check_random_state
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import ParameterGrid
from imblearn.utils import Substitution
from imblearn.pipeline import Pipeline
from imblearn.utils._docstring import _random_state_docstring


@Substitution(
    random_state=_random_state_docstring,
)
def check_random_states(random_state, n_runs):
    """
    Draw one integer seed per experiment run.

    The seeds are sampled one at a time from a random number generator
    derived from ``random_state``, so that each run of an experiment can be
    initialized differently while remaining reproducible.

    Parameters
    ----------
    {random_state}

    n_runs : int
        Number of initializations.

    Returns
    -------
    random_states : list
        A list of random states with length ``n_runs``.
    """
    rng = check_random_state(random_state)
    # ``randint`` excludes the upper bound.
    upper_bound = 2**32 - 1

    seeds = []
    # Draw sequentially (one call per seed) to keep the random stream stable.
    for _ in range(n_runs):
        seeds.append(rng.randint(0, upper_bound, dtype="uint32"))
    return seeds
@Substitution(
    random_state=_random_state_docstring,
)
def check_pipelines(*objects_list, random_state, n_runs):
    """
    Extract estimators and parameter grids to be passed to ModelSearchCV.

    Every combination of one entry per list in ``objects_list`` is chained
    into a pipeline named after its steps joined with ``"|"``. One
    single-valued parameter grid is then produced for each combination of
    step parameters and seed. This enables searching over any sequence of
    parameter settings and objects.

    Parameters
    ----------
    *objects_list : sequence of lists
        Lists of objects to be chained in a pipeline in the passed order.
        Each list must contain tuples composed of (``<obj_name>``,
        ``<object>``, ``<parameter_values_dict>``). An ``<object>`` of
        ``None`` is replaced by a default ``FunctionTransformer``.

    {random_state}

    n_runs : int
        Number of initializations.

    Returns
    -------
    estimators : list
        List of (``<name>``, ``<Pipeline>``) tuples with all combinations
        among the passed lists of objects.

    param_grids : list
        List of dictionaries with estimator and parameter names (``str``) as
        keys and lists of parameter settings to try as values. Each
        dictionary also holds an ``"est_name"`` key with the pipeline name.
    """
    # One seed per run
    seeds = check_random_states(random_state, n_runs)

    pipelines = []
    param_grid = []

    # Per pipeline name: the names of its parameters containing
    # "random_state". Looking these up by name ensures a repeated name always
    # uses the pipeline registered under it, never one left over from a
    # previous iteration.
    seed_params_by_name = {}

    for comb, rs in product(product(*objects_list), seeds):
        name = "|".join([step[0] for step in comb])

        # (name, object, expanded sub grid); ``None`` objects become a default
        # ``FunctionTransformer``
        steps = [
            (nm, ob if ob is not None else FunctionTransformer(), ParameterGrid(sg))
            for nm, ob, sg in comb
        ]

        # Create the estimator the first time its name is seen
        if name not in seed_params_by_name:
            est = Pipeline([(nm, ob) for nm, ob, _ in steps])
            pipelines.append((name, est))
            seed_params_by_name[name] = [
                param for param in est.get_params() if "random_state" in param
            ]
        seed_params = seed_params_by_name[name]

        # Intermediate parameter grids, keys prefixed with the step name
        sub_grids = [
            [{f"{nm}__{k}": v for k, v in param_def.items()} for param_def in sg]
            for nm, _, sg in steps
        ]

        # Final parameter grids, keys prefixed with the pipeline name
        param_prefix = f"{name}__"
        for sub_grid in product(*sub_grids):
            grid = {"est_name": [name]}
            grid.update(
                {f"{param_prefix}{k}": [v] for d in sub_grid for k, v in d.items()}
            )
            grid.update({f"{param_prefix}{param}": [rs] for param in seed_params})

            # Avoid multiple runs over pipelines without random state
            if grid not in param_grid:
                param_grid.append(grid)

    return pipelines, param_grid
@Substitution(
    random_state=_random_state_docstring,
)
def check_pipelines_wrapper(
    *objects_list,
    wrapper,
    random_state,
    n_runs,
    estimator_param="classifier",
    wrapped_only=True,
):
    """
    Extract estimators within a wrapper object and parameter grids to be
    passed to ModelSearchCV.

    The pipelines and grids are first built with ``check_pipelines``. Each
    pipeline is then set as the ``estimator_param`` parameter of a clone of
    the wrapper object, and the parameter grids are renamed accordingly.
    This enables searching over any sequence of parameter settings and
    objects.

    Parameters
    ----------
    *objects_list : sequence of lists
        Lists of objects to be chained in a pipeline in the passed order.
        Each list must contain tuples composed of (``<obj_name>``,
        ``<object>``, ``<parameter_values_dict>``).

    wrapper : tuple
        Wrapper object to which the pipelines will be passed. Must be
        structured as (``<obj_name>``, ``<object>``,
        ``<parameter_values_dict>``).

    {random_state}

    n_runs : int
        Number of initializations.

    estimator_param : str, default="classifier"
        Name of the parameter in the wrapper object where the estimators will
        be passed.

    wrapped_only : bool, default=True
        Return only the wrapped estimators. If ``False``, returns both the
        wrapped and the original objects.

    Returns
    -------
    wrapped_estimators : list
        List of (``<name>``, ``<estimator>``) tuples with the wrapped
        pipelines for all combinations among the passed lists of objects. If
        ``wrapped_only`` is ``False``, the unwrapped pipelines come first.

    wrapped_param_grids : list
        List of dictionaries with estimator and parameter names (``str``) as
        keys and lists of parameter settings to try as values. If
        ``wrapped_only`` is ``False``, the unwrapped grids come first.
    """
    wrapper_label = wrapper[0]
    wrapper_obj = wrapper[1]
    wrapper_grid = wrapper[2]

    estimators, param_grids = check_pipelines(
        *objects_list, random_state=random_state, n_runs=n_runs
    )

    # Each pipeline gets its own clone of the wrapper
    wrapped_estimators = [
        (
            f"{wrapper_label}|{name}",
            clone(wrapper_obj).set_params(**{estimator_param: pipeline}),
        )
        for name, pipeline in estimators
    ]

    def _format_param(param):
        # Drop the leading "<pipeline name>__" segment of a parameter key
        return "__".join(param.split("__")[1:])

    wrapped_param_grids = []
    for grid in param_grids:
        wrapped_name = f'{wrapper_label}|{grid["est_name"][0]}'
        wrapped_grid = {"est_name": [wrapped_name]}

        # Pipeline parameters are now nested under the wrapper's
        # ``estimator_param``
        wrapped_grid.update(
            {
                f"{wrapped_name}__{estimator_param}__{_format_param(k)}": v
                for k, v in grid.items()
                if k != "est_name"
            }
        )

        # Parameters of the wrapper itself
        wrapped_grid.update(
            {f"{wrapped_name}__{k}": v for k, v in wrapper_grid.items()}
        )
        wrapped_param_grids.append(wrapped_grid)

    if wrapped_only:
        return wrapped_estimators, wrapped_param_grids
    return (estimators + wrapped_estimators, param_grids + wrapped_param_grids)
def check_param_grids(param_grids, est_names):
    """
    Check the parameters grids to use with parametrized estimators.

    The grids are flattened into single-setting grids, validated against the
    estimator names and completed with an ``"est_name"`` entry. Estimators
    without any grid receive a grid holding only their name.

    Parameters
    ----------
    param_grids : dict or list of dicts
        Parameter grids, as accepted by ``ParameterGrid``. Parameter names
        are prefixed with the estimator name followed by ``"__"``. A grid may
        include an ``"est_name"`` key.

    est_names : list of str
        Names of the available estimators.

    Returns
    -------
    param_grids : list of dicts
        One dictionary per parameter setting, with single-element lists as
        values and an ``"est_name"`` key. Grids for estimators missing from
        the input are appended at the end, in the order of ``est_names``.

    Raises
    ------
    ValueError
        If a parameter prefix is not in ``est_names``, if a grid mixes several
        prefixes, or if the prefix disagrees with the grid's ``"est_name"``.
    """
    # Flatten the grids, dropping empty settings
    flat_param_grids = [
        param_grid for param_grid in ParameterGrid(param_grids) if param_grid
    ]

    checked_grids = []
    for flat_grid in flat_param_grids:
        # Get estimator name
        est_name = flat_grid.pop("est_name", None)

        # Wrap every value in a list
        param_grid = {param: [val] for param, val in flat_grid.items()}

        # Check estimators prefixes
        params_prefixes = {param.split("__")[0] for param in param_grid}
        if not params_prefixes.issubset(est_names):
            raise ValueError(
                "Parameters prefixes are not subset of parameter `est_names`."
            )
        if len(params_prefixes) > 1:
            raise ValueError("Parameters prefixes are not unique.")
        if est_name is not None and len(params_prefixes.union([est_name])) > 1:
            raise ValueError(
                "Parameters prefixes and parameter `est_name` are not unique."
            )
        param_grid["est_name"] = (
            [est_name] if est_name is not None else list(params_prefixes)
        )

        checked_grids.append(param_grid)

    # Append missing estimators names, following the order of ``est_names`` so
    # that the output does not depend on set iteration order
    seen_est_names = {param_grid["est_name"][0] for param_grid in checked_grids}
    for est_name in est_names:
        if est_name not in seen_est_names:
            seen_est_names.add(est_name)
            checked_grids.append({"est_name": [est_name]})

    return checked_grids


def _get_estimator_type(estimator):
    """
    Detect the estimator type using sklearn tags, legacy attributes, and
    MRO-based mixin detection.

    Returns the detected type as a string, or ``None`` if no strategy
    succeeds.
    """
    # Try sklearn's tag system; unavailable or failing lookups fall through
    try:
        from sklearn.utils._tags import get_tags

        tags = get_tags(estimator)
        if tags.estimator_type is not None:
            return tags.estimator_type
    except (ImportError, AttributeError):
        pass

    # Try legacy _estimator_type attribute
    est_type = getattr(estimator, "_estimator_type", None)
    if est_type is not None:
        return est_type

    # Check MRO for sklearn mixins (for estimator instances)
    from sklearn.base import ClassifierMixin, RegressorMixin, TransformerMixin
    from imblearn.base import SamplerMixin

    mro = set(getattr(type(estimator), "__mro__", []))
    # Order matters: the first matching mixin wins
    mixin_types = (
        (ClassifierMixin, "classifier"),
        (RegressorMixin, "regressor"),
        (TransformerMixin, "transformer"),
        (SamplerMixin, "sampler"),
    )
    for mixin, mixin_type in mixin_types:
        if mixin in mro:
            return mixin_type
    return None


def check_estimator_type(estimators):
    """
    Returns the type of estimators.

    Parameters
    ----------
    estimators : list of (str, estimator) tuples
        Named estimators, all expected to share a single estimator type.

    Returns
    -------
    estimator_type : str
        The estimator type shared by all the estimators.

    Raises
    ------
    ValueError
        If a type cannot be detected for some estimator, if more than one
        type is found, or if ``estimators`` is empty.
    """
    est_type_map = {
        name: _get_estimator_type(estimator) for name, estimator in estimators
    }
    estimator_types = set(est_type_map.values())

    if None in estimator_types:
        unknown = [name for name, est_type in est_type_map.items() if est_type is None]
        raise ValueError(
            f"Could not detect estimator type for: {unknown}. "
            "Ensure all estimators define a valid estimator type via sklearn tags, "
            "mixins (ClassifierMixin, RegressorMixin, etc.), or the "
            "_estimator_type attribute."
        )

    if len(estimator_types) > 1:
        raise ValueError(
            f"Multiple estimator types found: {sorted(estimator_types)}. "
            "A single estimator type should be included."
        )

    if len(estimator_types) == 0:
        raise ValueError(
            "No estimator type found. "
            "Ensure all estimators define a valid estimator type via sklearn tags, "
            "mixins (ClassifierMixin, RegressorMixin, etc.), or the "
            "_estimator_type attribute."
        )

    return estimator_types.pop()