# Source code for auswahl._fipls

from typing import Union, List, Dict

import numpy as np
from joblib import Parallel, delayed
from sklearn.cross_decomposition import PLSRegression
from sklearn.utils.validation import check_is_fitted

from ._base import IntervalSelector


class FiPLS(IntervalSelector):
    """Feature Selection with Forward interval Partial Least Squares (FiPLS).

    The FiPLS method has been described in Xiaobo et al. [1]_.
    This implementation deviates from the original description as it allows
    intervals to be selected at arbitrary positions.

    Read more in the :ref:`User Guide <fipls>`.

    Parameters
    ----------
    n_intervals_to_select : int, default=1
        Number of intervals to select.

    interval_width : int or float, default=None
        Number of features that form an interval.

    pls : PLSRegression, default=None
        Estimator instance of the :py:class:`PLSRegression <sklearn.cross_decomposition.PLSRegression>` class. Use this
        to adjust the hyperparameters of the PLS method.

    n_cv_folds : int, default=10
        Number of cross validation folds used to evaluate intervals.

    model_hyperparams : dict or list of dict, default=None
        Passed through unchanged to the :class:`IntervalSelector` base class constructor.

    n_jobs : int, default=1
        Number of parallel processes that fit PLS models on the different intervals.

    Attributes
    ----------
    support_ : ndarray of shape (n_features,)
        Mask of selected features.

    best_model_ : object
        Model returned by ``evaluate`` for the best-scoring candidate of the last selection round.

    References
    ----------
    .. [1] Zou Xiaobo, Zhao Jiewen, Li Yanxiao,
           'Selection of the efficient wavelength regions in FT-NIR spectroscopy for determination of SSC of ‘Fuji’
           apple based on BiPLS and FiPLS models',
           Vibrational Spectroscopy, vol. 44, no. 2, 220--227, 2007.

    Examples
    --------
    >>> import numpy as np
    >>> from auswahl import FiPLS
    >>> X = np.random.randn(100, 10)
    >>> y = 5 * X[:, 0] - 4 * X[:,1] - 2 * X[:, 5] + 3 * X[:,6]  # y depends on two intervals
    >>> selector = FiPLS(n_intervals_to_select=2, interval_width=2)
    >>> selector.fit(X, y).get_support()
    array([ True, True, False, False, False, True, True, False, False, False])
    """

    def __init__(self,
                 n_intervals_to_select: int = 1,
                 interval_width: Union[int, float] = None,
                 pls: PLSRegression = None,
                 n_cv_folds: int = 10,
                 model_hyperparams: Union[Dict, List[Dict]] = None,
                 n_jobs: int = 1):
        super().__init__(n_intervals_to_select, interval_width,
                         n_cv_folds=n_cv_folds, model_hyperparams=model_hyperparams, n_jobs=n_jobs)
        self.pls = pls
        self.n_cv_folds = n_cv_folds

    def _fit(self, X, y, n_intervals_to_select, interval_width):
        """Greedily add the best-scoring interval, one per round.

        In every round each window of ``interval_width`` consecutive, not yet
        selected features is appended to the already selected features and
        scored with ``self.evaluate``. The window with the highest score is
        added to the selection.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data.

        y : array-like
            Target values, forwarded unchanged to ``self.evaluate``.

        n_intervals_to_select : int
            Number of selection rounds.

        interval_width : int
            Number of features per interval.

        Returns
        -------
        self : FiPLS
            The instance with ``support_`` (and ``best_model_`` if at least one round ran) set.

        Raises
        ------
        ValueError
            If fewer than ``interval_width`` unselected features remain in a round.
        """
        n_features = X.shape[1]
        selection = np.zeros(n_features, dtype=bool)

        # Reuse a single worker pool across all selection rounds.
        with Parallel(n_jobs=self.n_jobs) as parallel:
            for _ in range(n_intervals_to_select):
                free_mask = ~selection
                free_idx = np.flatnonzero(free_mask)
                x_selected = X[:, selection]
                x_free = X[:, free_mask]

                n_candidates = len(free_idx) - interval_width + 1
                if n_candidates < 1:
                    # Without this guard the unpacking of an empty result list
                    # below fails with an opaque "not enough values to unpack".
                    raise ValueError(
                        f"Cannot place an interval of width {interval_width}: "
                        f"only {len(free_idx)} unselected features remain."
                    )

                # Windows slide over the *free* features only, so a window may
                # straddle a region that has already been selected.
                evaluations = parallel(
                    delayed(self.evaluate)(
                        np.concatenate([x_selected, x_free[:, start:start + interval_width]], axis=1),
                        y,
                        self.pls,
                    )
                    for start in range(n_candidates)
                )
                scores, models = zip(*evaluations)
                best_idx = np.argmax(scores)

                # Map the winning window back to original feature positions.
                # Anything between its first and last feature is either part of
                # the window or already selected, so marking the full range is safe.
                first = free_idx[best_idx]
                last = free_idx[best_idx + interval_width - 1]
                selection[first:last + 1] = True
                self.best_model_ = models[best_idx]

        self.support_ = selection
        return self