Source code for auswahl._bipls

from typing import Union, Dict, List

import numpy as np
from joblib import Parallel, delayed
from sklearn.cross_decomposition import PLSRegression
from sklearn.utils.validation import check_is_fitted

from ._base import IntervalSelector


class BiPLS(IntervalSelector):
    """Feature Selection with Backward interval Partial Least Squares (BiPLS).

    The feature axis is cut into consecutive intervals of equal width; the
    last interval is shorter when the total number of features is not a whole
    multiple of the interval width. Intervals are then discarded one at a
    time: in every round each remaining interval is left out in turn, and the
    interval whose omission yields the highest evaluation score is removed.
    The BiPLS method has been described in Xiaobo et al. [1]_.

    Read more in the :ref:`User Guide <bipls>`.

    Parameters
    ----------
    n_intervals_to_select : int, default=1
        Number of intervals to select.

    interval_width : int or float, default=None
        Number of features that form an interval.

    pls : PLSRegression, default=None
        Estimator instance of the
        :py:class:`PLSRegression <sklearn.cross_decomposition.PLSRegression>`
        class. Use this to adjust the hyperparameters of the PLS method.

    n_cv_folds : int, default=10
        Number of cross validation folds used to evaluate intervals.

    model_hyperparams : dict or list of dict, default=None
        Forwarded unchanged to the ``IntervalSelector`` constructor; its
        meaning is defined by that base class.

    n_jobs : int, default=1
        Number of parallel processes that fit PLS models on the different
        intervals.

    Attributes
    ----------
    support_ : ndarray of shape (n_features,)
        Mask of selected features.

    rank_ : ndarray of shape (n_features,)
        Relative rank of selection. The interval with the lowest relative
        rank has been removed first. The finally selected intervals have a
        relative rank of 1.

    best_model_ : object
        Model object returned by ``evaluate`` for the feature subset that
        remained after the most recent interval removal. Only assigned when
        at least one interval is removed.

    References
    ----------
    .. [1] Zou Xiaobo, Zhao Jiewen, Li Yanxiao,
           'Selection of the efficient wavelength regions in FT-NIR
           spectroscopy for determination of SSC of ‘Fuji’ apple based on
           BiPLS and FiPLS models',
           Vibrational Spectroscopy, vol. 44, no. 2, 220--227, 2007.

    Examples
    --------
    >>> import numpy as np
    >>> from auswahl import BiPLS
    >>> np.random.seed(1337)
    >>> X = np.random.randn(100, 10)
    >>> y = 5 * X[:, 0] - 4 * X[:,1] - 2 * X[:, 4] + 3 * X[:, 5]  # y depends on two intervals
    >>> selector = BiPLS(n_intervals_to_select=2, interval_width=2)
    >>> selector.fit(X, y).get_support()
    array([ True,  True, False, False,  True,  True, False, False, False, False])
    """

    def __init__(self,
                 n_intervals_to_select: int = 1,
                 interval_width: Union[int, float] = None,
                 pls: PLSRegression = None,
                 n_cv_folds: int = 10,
                 model_hyperparams: Union[Dict, List[Dict]] = None,
                 n_jobs: int = 1):
        """Store the configuration; see the class docstring for the parameters."""
        super().__init__(
            n_intervals_to_select,
            interval_width,
            model_hyperparams=model_hyperparams,
            n_cv_folds=n_cv_folds,
            n_jobs=n_jobs,
        )

        self.pls = pls
        self.n_cv_folds = n_cv_folds

    def _fit(self, X, y, n_intervals_to_select, interval_width):
        """Run the backward elimination and store the resulting selection.

        Parameters
        ----------
        X : array indexed as ``X[:, mask]``
            Data whose second axis holds the features.

        y : object
            Target data, handed to ``self.evaluate`` unchanged.

        n_intervals_to_select : int
            Number of intervals that are kept.

        interval_width : int
            Number of features per interval; used as a ``range`` step.

        Returns
        -------
        self
            With ``support_`` and ``rank_`` assigned, plus ``best_model_``
            when at least one interval was removed.
        """
        total_features = X.shape[1]
        keep_mask = np.ones(total_features, dtype=bool)
        relative_rank = np.ones(total_features)

        # Column (in the full X) at which every still-selected interval starts.
        interval_starts = list(range(0, total_features, interval_width))
        n_intervals_to_remove = len(interval_starts) - n_intervals_to_select

        with Parallel(n_jobs=self.n_jobs) as workers:
            for removal_round in range(n_intervals_to_remove):
                remaining = X[:, keep_mask]
                n_remaining = remaining.shape[1]

                # One candidate per remaining interval: the remaining features
                # with exactly that interval left out. Only the last interval
                # can be short, so the k-th candidate always corresponds to
                # interval_starts[k].
                jobs = (
                    delayed(self.evaluate)(
                        np.delete(
                            remaining,
                            np.arange(offset,
                                      min(n_remaining, offset + interval_width)),
                            axis=1,
                        ),
                        y,
                        self.pls,
                    )
                    for offset in range(0, n_remaining, interval_width)
                )
                scores, models = zip(*workers(jobs))

                # The highest score marks the interval whose omission is best.
                winner = np.argmax(scores)
                self.best_model_ = models[winner]

                start = interval_starts.pop(winner)
                dropped = slice(start, start + interval_width)
                keep_mask[dropped] = False
                relative_rank[dropped] = removal_round / n_intervals_to_remove

        self.support_ = keep_mask
        self.rank_ = relative_rank

        return self