# Source code for auswahl._mcuve

from typing import Union, List, Dict

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.utils import check_random_state
from sklearn.utils.validation import check_is_fitted, check_scalar

from .util import get_coef_from_pls
from ._base import PointSelector


class MCUVE(PointSelector):
    """Feature selection with Monte Carlo Uninformative Variable Elimination.

    The stability for each feature is computed according to Cai et al. [1]_.

    Note that the **absolute** stability values are used to determine the most important features.

    Read more in the :ref:`User Guide <mcuve>`.

    Parameters
    ----------
    n_features_to_select : int or float, default=None
        Number of features to select.

    n_subsets : int, default=100
        Number of random subsets to create. Has to be an int of at least 2.

    n_samples_per_subset : int or float, default=None
        Number of samples used for each random subset. An int has to lie in ``{1, ..., n_samples-1}``. A float in
        ``(0, 1)`` is interpreted as a fraction of ``n_samples`` (rounded down, but at least one sample). If None,
        ``n_samples // 2`` samples are used.

    pls : PLSRegression, default=None
        Estimator instance of the :py:class:`PLSRegression <sklearn.cross_decomposition.PLSRegression>` class. Use this
        to adjust the hyperparameters of the PLS method.

    n_cv_folds : int, default=5
        Forwarded unchanged to the :class:`PointSelector` base class.

    model_hyperparams : dict or list of dict, default=None
        Forwarded unchanged to the :class:`PointSelector` base class.

    random_state : int or numpy.random.RandomState, default=None
        Seed for the random subset sampling. Pass an int for reproducible output across function calls.

    Attributes
    ----------
    coefs_ : ndarray of shape (n_subsets, n_features)
        Fitted regression coefficients of the <n_subsets> PLS models.

    stability_ : ndarray of shape (n_features,)
        Computed stability value (mean / standard deviation of the coefficients) for each feature. While this
        attribute contains the signed stability values, MC-UVE uses the absolute values to select the most important
        features. Features whose coefficient is zero in every subset model get a stability of 0.

    support_ : ndarray of shape (n_features,)
        Mask of selected features.

    best_model_ : object
        Model returned by ``evaluate`` for the selected features.

    References
    ----------
    .. [1] Wensheng Cai, Yankun Li and Xueguang Shao,
           'A variable selection method based on uninformative variable elimination for
           multivariate calibration of near-infrared spectra',
           Chemometrics and Intelligent Laboratory Systems, 90, 188-194, 2008.

    Examples
    --------
    >>> import numpy as np
    >>> from auswahl import MCUVE
    >>> X = np.random.randn(100, 10)
    >>> y = 5 * X[:, 0] - 2 * X[:, 5]  # y only depends on two features
    >>> selector = MCUVE(n_features_to_select=2)
    >>> selector.fit(X, y).get_support()
    array([ True, False, False, False, False, True, False, False, False, False])
    """

    def __init__(self,
                 n_features_to_select: Union[int, float] = None,
                 n_subsets: int = 100,
                 n_samples_per_subset: Union[int, float] = None,
                 pls: PLSRegression = None,
                 n_cv_folds: int = 5,
                 model_hyperparams: Union[Dict, List[Dict]] = None,
                 random_state: Union[int, np.random.RandomState] = None):
        # Parameters are only stored here; they are validated when fitting.
        super().__init__(n_features_to_select, model_hyperparams, n_cv_folds, random_state)
        self.n_subsets = n_subsets
        self.n_samples_per_subset = n_samples_per_subset
        self.pls = pls

    def _fit(self, X, y, n_features_to_select):
        """Compute the MC-UVE stabilities and select the most stable features.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data. NOTE(review): assumed to be already validated by the caller -- confirm in the base class.

        y : ndarray
            Target values, indexed along the first axis with the same sample indices as ``X``.

        n_features_to_select : int
            Number of features to keep. NOTE(review): assumed to be a positive int resolved by the base class.

        Returns
        -------
        self : MCUVE
            The fitted selector with ``coefs_``, ``stability_``, ``support_`` and ``best_model_`` set.
        """
        # Validate the cheap parameters first so that invalid settings fail before any model is fitted.
        self._check_n_subsets()
        n_samples_per_subset = self._check_n_samples_per_subset(X)
        random_state = check_random_state(self.random_state)

        # ``evaluate`` is provided by the base class; its second return value is the model that is refitted below.
        _, model = self.evaluate(X, y, self.pls, do_cv=False)

        n_samples = X.shape[0]
        coefs = []
        for _ in range(self.n_subsets):
            # Draw a random subset of the samples without replacement.
            idx = random_state.permutation(n_samples)[:n_samples_per_subset]
            model.fit(X[idx], y[idx])
            coefs.append(get_coef_from_pls(model).squeeze())

        self.coefs_ = np.array(coefs)
        coef_mean = self.coefs_.mean(axis=0)
        coef_std = self.coefs_.std(axis=0)

        # A coefficient that is zero in every subset model would yield 0/0 = nan. argsort places nan last, which
        # would rank such a feature as the most important one; give it a stability of 0 instead.
        always_zero = (coef_mean == 0) & (coef_std == 0)
        self.stability_ = coef_mean / np.where(always_zero, 1.0, coef_std)

        # The features with the largest absolute stability are at the end of the ascending argsort.
        selected_idx = np.argsort(np.abs(self.stability_))[-n_features_to_select:]
        self.support_ = np.zeros(X.shape[1], dtype=bool)
        self.support_[selected_idx] = True
        _, self.best_model_ = self.evaluate(X[:, self.support_], y, self.pls, do_cv=False)

        return self

    def _check_n_subsets(self):
        """Validate that ``n_subsets`` is an int of at least 2 (raises via ``check_scalar`` otherwise)."""
        check_scalar(x=self.n_subsets, name='n_subsets', target_type=int, min_val=2)

    def _check_n_samples_per_subset(self, X):
        """Resolve ``n_samples_per_subset`` into an absolute number of samples.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data; only its number of rows is used.

        Returns
        -------
        n_samples_per_subset : int
            Number of samples per subset, guaranteed to lie in ``{1, ..., n_samples-1}``.

        Raises
        ------
        ValueError
            If the resolved value is not an integer in ``{1, ..., n_samples-1}``.
        """
        n_samples = X.shape[0]
        n_samples_per_subset = self.n_samples_per_subset

        if n_samples_per_subset is None:
            n_samples_per_subset = n_samples // 2
        elif 0 < n_samples_per_subset < 1:
            # Fraction of the available samples, but never fewer than one sample.
            n_samples_per_subset = max(1, int(n_samples_per_subset * n_samples))

        # Non-integral values (e.g. 2.5 or nan) would otherwise only fail later when used as a slice bound.
        is_integral = isinstance(n_samples_per_subset, (int, np.integer))
        if not is_integral or not (0 < n_samples_per_subset < n_samples):
            raise ValueError('n_samples_per_subset has to be either an int in {1, ..., n_samples-1} '
                             f'or a float in (0, 1); got {self.n_samples_per_subset}')

        return int(n_samples_per_subset)