Source code for auswahl._vip

from typing import Union, List, Dict
from warnings import warn

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.utils.validation import check_is_fitted

from ._base import PointSelector, Convertible


class VIP(PointSelector, Convertible):
    """Feature Selection with Variable Importance in Projection.

    The VIP scores are computed according to Favilla et al. [1]_.

    Read more in the :ref:`User Guide <vip>`.

    Parameters
    ----------
    n_features_to_select : int or float, default=None
        Number of features to select.

    n_cv_folds : int, default=5
        Forwarded unchanged to the ``PointSelector`` constructor. VIP itself always
        calls ``evaluate`` with ``do_cv=False``.
        NOTE(review): presumably the number of cross-validation folds used by the
        base class -- confirm against ``PointSelector``.

    pls : PLSRegression, default=None
        Estimator instance of the :py:class:`PLSRegression <sklearn.cross_decomposition.PLSRegression>`
        class. Use this to adjust the hyperparameters of the PLS method.

    model_hyperparams : dict or list of dict, default=None
        Forwarded unchanged to the ``PointSelector`` constructor.
        NOTE(review): semantics are defined by the base class -- confirm there.

    Attributes
    ----------
    pls_ : PLSRegression instance
        Fitted PLS estimator used to calculate the VIP scores.

    vips_ : ndarray of shape (n_features,)
        Calculated VIP scores.

    support_ : ndarray of shape (n_features,)
        Mask of selected features.

    best_model_ : estimator
        Model returned by ``evaluate`` when called on the selected features only.

    References
    ----------
    .. [1] Stefania Favilla, Caterina Durante, Mario Li Vigni, Marina Cocchi,
           'Assessing feature relevance in NPLS models by VIP',
           Chemometrics and Intelligent Laboratory Systems, 129, 76--86, 2013.

    Examples
    --------
    >>> import numpy as np
    >>> from auswahl import VIP
    >>> X = np.random.randn(100, 10)
    >>> y = 5 * X[:, 0] - 2 * X[:, 5]  # y only depends on two features
    >>> selector = VIP(n_features_to_select=2)
    >>> selector.fit(X, y).get_support()
    array([ True, False, False, False, False,  True, False, False, False, False])
    """

    def __init__(self,
                 n_features_to_select: Union[int, float, None] = None,
                 n_cv_folds: int = 5,
                 pls: PLSRegression = None,
                 model_hyperparams: Union[Dict, List[Dict], None] = None):
        super().__init__(n_features_to_select, model_hyperparams, n_cv_folds)
        self.pls = pls

    def _fit(self, X, y, n_features_to_select):
        """Compute the VIP scores and select the highest-scoring features.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Input data.
        y : ndarray
            Target values.
        n_features_to_select : int
            Number of top-scoring features to mark in ``support_``.

        Returns
        -------
        self : VIP
        """
        # Model obtained on all features; it is the basis of the VIP scores.
        _, model = self.evaluate(X, y, self.pls, do_cv=False)
        # Keep the model around: the class documents it as the ``pls_`` attribute.
        self.pls_ = model
        self.vips_ = self._calculate_vip_scores(X, model)

        # argsort is ascending, so the trailing entries are the largest scores.
        selected_idx = np.argsort(self.vips_)[-n_features_to_select:]
        self.support_ = np.zeros(X.shape[1], dtype=bool)
        self.support_[selected_idx] = True

        # Second model obtained on the selected columns only.
        _, self.best_model_ = self.evaluate(X[:, self.support_], y, self.pls, do_cv=False)
        return self

    def _calculate_vip_scores(self, X, model):
        """Return one VIP score per feature for a fitted PLS model.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Data that is projected onto the PLS components.
        model : fitted PLS model
            Must provide ``transform``, ``x_weights_`` and ``y_loadings_``.

        Returns
        -------
        vips : ndarray of shape (n_features,)
        """
        x_scores = model.transform(X)
        x_weights = model.x_weights_  # already normalized
        y_loadings = model.y_loadings_
        num_features = X.shape[1]

        # Variance of y explained by each component. y_loadings has one row per
        # target, so the product has one row per target as well; summing the rows
        # yields a single value per component and keeps the result at one score
        # per feature even when y has several targets.
        explained_variance = ((y_loadings ** 2) @ (x_scores.T @ x_scores)).sum(axis=0)
        # Weight each component's explained variance by the squared feature weights.
        weighted_explained_variance = (x_weights ** 2) @ explained_variance

        return np.sqrt((num_features * weighted_explained_variance) / explained_variance.sum())

    def get_support_for_threshold(self, threshold: float = 1, indices: bool = False):
        """Select a set of features whose VIP values are above a given threshold.

        If no VIP score exceeds the threshold, a warning is issued and only the
        feature with the highest VIP score is selected.

        Parameters
        ----------
        threshold : float, default=1
            Lower bound that has to be exceeded by the VIP value of a feature so that it is selected.

        indices : bool, default=False
            If True, the return value will be an array of integers, rather than a boolean mask.

        Returns
        -------
        selection : ndarray of shape (n_features,)
            Boolean mask of selected features, or array of indices if indices=True.
        """
        check_is_fitted(self)

        mask = self.vips_ > threshold
        if not np.any(mask):
            # stacklevel=2 attributes the warning to the caller's line.
            warn(f'No VIP score is higher than the given threshold of {threshold}. '
                 f'Only the most important feature will be selected with a VIP value of {self.vips_.max()}',
                 stacklevel=2)
            mask[np.argmax(self.vips_)] = True

        return mask if not indices else np.where(mask)[0]

    def get_feature_scores(self):
        """Return the VIP scores calculated during fitting.

        Returns
        -------
        vips : ndarray of shape (n_features,)
            The ``vips_`` attribute.
        """
        check_is_fitted(self)
        return self.vips_