# Source code for auswahl._vip

from typing import Union, List, Dict
from warnings import warn

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.utils.validation import check_is_fitted

from ._base import PointSelector, Convertible


class VIP(PointSelector, Convertible):
    """Feature Selection with Variable Importance in Projection.

    The VIP scores are computed according to Favilla et al. [1]_.

    Read more in the :ref:`User Guide <vip>`.

    Parameters
    ----------
    n_features_to_select : int or float, default=None
        Number of features to select.

    n_cv_folds : int, default=5
        Forwarded unchanged to the :class:`PointSelector` constructor.

    pls : PLSRegression, default=None
        Estimator instance of the :py:class:`PLSRegression <sklearn.cross_decomposition.PLSRegression>` class. Use this
        to adjust the hyperparameters of the PLS method.

    model_hyperparams : dict or list of dict, default=None
        Forwarded unchanged to the :class:`PointSelector` constructor.

    Attributes
    ----------
    vips_ : ndarray of shape (n_features,)
        Calculated VIP scores.

    support_ : ndarray of shape (n_features,)
        Mask of selected features.

    best_model_ : estimator
        Model returned by ``evaluate`` after refitting on the selected features only.

    References
    ----------
    .. [1] Stefania Favilla, Caterina Durante, Mario Li Vigni, Marina Cocchi,
           'Assessing feature relevance in NPLS models by VIP',
           Chemometrics and Intelligent Laboratory Systems, 129, 76--86, 2013.

    Examples
    --------
    >>> import numpy as np
    >>> from auswahl import VIP
    >>> X = np.random.randn(100, 10)
    >>> y = 5 * X[:, 0] - 2 * X[:, 5]  # y only depends on two features
    >>> selector = VIP(n_features_to_select=2)
    >>> selector.fit(X, y).get_support()
    array([ True, False, False, False, False, True, False, False, False, False])
    """

    def __init__(self,
                 n_features_to_select: Union[int, float] = None,
                 n_cv_folds: int = 5,
                 pls: PLSRegression = None,
                 model_hyperparams: Union[Dict, List[Dict]] = None):
        # Everything except the PLS estimator is handled by the base selector.
        super().__init__(n_features_to_select, model_hyperparams, n_cv_folds)
        self.pls = pls

    def _fit(self, X, y, n_features_to_select):
        """Compute VIP scores on all features and keep the highest-scoring ones.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data.

        y : ndarray
            Target values, passed through to ``evaluate``.

        n_features_to_select : int
            Number of features to keep; used directly as a slice bound
            (presumably already resolved to an integer by the caller).

        Returns
        -------
        self : VIP
            The fitted selector.
        """
        # The second value returned by ``evaluate`` is used as the fitted model.
        _, model = self.evaluate(X, y, self.pls, do_cv=False)
        self.vips_ = self._calculate_vip_scores(X, model)

        # argsort is ascending, so the tail holds the highest-scoring features.
        selected_idx = np.argsort(self.vips_)[-n_features_to_select:]
        self.support_ = np.zeros(X.shape[1], dtype=bool)
        self.support_[selected_idx] = True

        # Refit on the reduced feature set so ``best_model_`` matches ``support_``.
        _, self.best_model_ = self.evaluate(X[:, self.support_], y, self.pls, do_cv=False)

        return self

    def _calculate_vip_scores(self, X, model):
        """Return one VIP score per feature for a fitted PLS model.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Data that is projected onto the latent components of ``model``.

        model : fitted estimator
            Must provide ``transform``, ``x_weights_`` and ``y_loadings_``
            (as a fitted ``PLSRegression`` does).

        Returns
        -------
        vips : ndarray of shape (n_features,)
            VIP scores. With several targets, the variance explained by each
            component is summed over all targets, so the result always has
            exactly one score per feature.
        """
        x_scores = model.transform(X)
        x_weights = model.x_weights_  # already normalized
        # atleast_2d keeps the target axis explicit even for a 1-D loading vector.
        y_loadings = np.atleast_2d(model.y_loadings_)

        num_features = X.shape[1]

        # Explained variance per (target, component); collapsing the target axis
        # yields a single value per component regardless of the number of targets.
        explained_per_target = (y_loadings ** 2) @ (x_scores.T @ x_scores)
        explained_variance = explained_per_target.sum(axis=0)

        weighted_explained_variance = (x_weights ** 2) @ explained_variance
        vips = np.sqrt((num_features * weighted_explained_variance) / explained_variance.sum())

        return vips.flatten()

    def get_support_for_threshold(self, threshold: float = 1, indices: bool = False):
        """Select a set of features whose VIP values are above a given threshold.

        If no VIP value exceeds the threshold, a warning is issued and only the
        feature with the highest VIP value is selected.

        Parameters
        ----------
        threshold : float, default=1
            Lower bound that has to be exceeded by the VIP value of a feature so that it is selected.

        indices : bool, default=False
            If True, the return value will be an array of integers, rather than a boolean mask.

        Returns
        -------
        selection : ndarray
            Boolean mask of shape (n_features,), or the integer indices of the
            selected features if indices=True.
        """
        check_is_fitted(self)
        mask = self.vips_ > threshold
        if not np.any(mask):
            warn(f'No VIP score is higher than the given threshold of {threshold}. '
                 f'Only the most important feature will be selected with a VIP value of {self.vips_.max()}')
            # Fall back to the single best feature instead of an empty selection.
            mask[np.argmax(self.vips_)] = True

        return mask if not indices else np.where(mask)[0]

    def get_feature_scores(self):
        """Return the VIP scores computed during fitting.

        Returns
        -------
        vips : ndarray of shape (n_features,)
            The ``vips_`` attribute itself (not a copy).
        """
        check_is_fitted(self)
        return self.vips_