Source code for auswahl._fipls

from typing import Union, List, Dict

import numpy as np
from joblib import Parallel, delayed
from sklearn.cross_decomposition import PLSRegression
from sklearn.utils.validation import check_is_fitted

from ._base import IntervalSelector


class FiPLS(IntervalSelector):
    """Feature Selection with Forward interval Partial Least Squares (FiPLS).

    The FiPLS method has been described in Xiaobo et al. [1]_.
    This implementation deviates from the original description as it allows
    to select intervals at arbitrary positions.

    Read more in the :ref:`User Guide <fipls>`.

    Parameters
    ----------
    n_intervals_to_select : int, default=1
        Number of intervals to select.

    interval_width : int or float, default=None
        Number of features that form an interval.

    pls : PLSRegression, default=None
        Estimator instance of the
        :py:class:`PLSRegression <sklearn.cross_decomposition.PLSRegression>`
        class. Use this to adjust the hyperparameters of the PLS method.

    n_cv_folds : int, default=10
        Number of cross validation folds used to evaluate intervals.

    model_hyperparams : dict or list of dict, default=None
        Forwarded unchanged to the :class:`IntervalSelector` constructor.

    n_jobs : int, default=1
        Number of parallel processes that fit PLS models on the different
        intervals.

    Attributes
    ----------
    support_ : ndarray of shape (n_features,)
        Mask of selected features.

    best_model_ : object
        Model returned by ``evaluate`` for the best-scoring candidate of the
        final selection round.

    References
    ----------
    .. [1] Zou Xiaobo, Zhao Jiewen, Li Yanxiao,
           'Selection of the efficient wavelength regions in FT-NIR
           spectroscopy for determination of SSC of ‘Fuji’ apple based on
           BiPLS and FiPLS models',
           Vibrational Spectroscopy, vol. 44, no. 2, 220--227, 2007.

    Examples
    --------
    >>> import numpy as np
    >>> from auswahl import FiPLS
    >>> X = np.random.randn(100, 10)
    >>> y = 5 * X[:, 0] - 4 * X[:,1] - 2 * X[:, 5] + 3 * X[:,6]  # y depends on two intervals
    >>> selector = FiPLS(n_intervals_to_select=2, interval_width=2)
    >>> selector.fit(X, y).get_support()
    array([ True,  True, False, False, False,  True,  True, False, False,
           False])
    """

    def __init__(self,
                 n_intervals_to_select: int = 1,
                 interval_width: Union[int, float] = None,
                 pls: PLSRegression = None,
                 n_cv_folds: int = 10,
                 model_hyperparams: Union[Dict, List[Dict]] = None,
                 n_jobs: int = 1):
        super().__init__(n_intervals_to_select,
                         interval_width,
                         n_cv_folds=n_cv_folds,
                         model_hyperparams=model_hyperparams,
                         n_jobs=n_jobs)

        self.pls = pls
        self.n_cv_folds = n_cv_folds

    def _fit(self, X, y, n_intervals_to_select, interval_width):
        """Greedily add ``n_intervals_to_select`` windows of features.

        In every round each window of ``interval_width`` consecutive
        not-yet-selected features is joined with the features selected so far
        and scored through ``self.evaluate``; the window with the highest
        score is added to the selection.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Input data; columns are features.

        y : array-like
            Targets, passed through to ``self.evaluate`` unchanged.

        n_intervals_to_select : int
            Number of selection rounds to run.

        interval_width : int
            Number of features per candidate window.

        Returns
        -------
        self : FiPLS
            The selector with ``support_`` and ``best_model_`` set.

        Raises
        ------
        ValueError
            If ``n_intervals_to_select`` is smaller than one, or if fewer
            than ``interval_width`` unselected features remain in a round.
        """
        if n_intervals_to_select < 1:
            # Without at least one round there is no model to store.
            raise ValueError(
                'n_intervals_to_select has to be at least 1; '
                f'got {n_intervals_to_select}.')

        n_features = X.shape[1]
        feature_idx = np.arange(n_features)
        selection = np.zeros(n_features, dtype=bool)
        best_model = None

        # Reuse one worker pool across all selection rounds.
        with Parallel(n_jobs=self.n_jobs) as parallel:
            for _ in range(n_intervals_to_select):
                x_selected = X[:, selection]
                x_free = X[:, ~selection]
                free_idx = feature_idx[~selection]

                n_candidates = len(free_idx) - interval_width + 1
                if n_candidates < 1:
                    raise ValueError(
                        f'Cannot place an interval of width {interval_width}: '
                        f'only {len(free_idx)} of {n_features} features are '
                        'still unselected.')

                # NOTE(review): windows slide over the compacted array of free
                # features, so a candidate may straddle an interval that has
                # already been selected; it then extends that interval on
                # both sides rather than forming a new contiguous one.
                evaluations = parallel(
                    delayed(self.evaluate)(
                        np.concatenate(
                            [x_selected, x_free[:, i:i + interval_width]],
                            axis=1),
                        y,
                        self.pls)
                    for i in range(n_candidates))

                scores, models = zip(*evaluations)
                best_idx = int(np.argmax(scores))

                # Map the window from free-feature positions back to the
                # original column indices before marking it as selected.
                first = free_idx[best_idx]
                last = free_idx[best_idx + interval_width - 1]
                selection[first:last + 1] = True
                best_model = models[best_idx]

        self.best_model_ = best_model
        self.support_ = selection

        return self