import warnings
from typing import Union, Dict, List
import numpy as np
from joblib import Parallel, delayed
from sklearn.cross_decomposition import PLSRegression
from sklearn.utils.validation import check_is_fitted
from ._base import IntervalSelector
from ._base import FeatureDescriptor
class IPLS(IntervalSelector):
    """Interval selection with Interval Partial Least Squares (iPLS).

    The optimal interval of a specified width is calculated according to Nørgaard et al. [1]_.

    Read more in the :ref:`User Guide <ipls>`.

    Parameters
    ----------
    n_intervals_to_select : int, default=1
        Accepted for interface compatibility. IPLS selects a single interval: any value other than 1
        triggers a warning and 1 is forwarded to the base class.

    interval_width : int or float, default=None
        Width of the interval to select.

    n_cv_folds : int, default=10
        Number of cross validation folds used to evaluate intervals

    pls : PLSRegression, default=None
        Estimator instance of the :py:class:`PLSRegression <sklearn.cross_decomposition.PLSRegression>` class. Use this
        to adjust the hyperparameters of the PLS method.

    n_jobs : int, default=1
        Number of jobs handed to :py:class:`joblib.Parallel` when the candidate intervals are evaluated.

    model_hyperparams : dict or list of dict, default=None
        Forwarded unchanged to the base class.

    random_state : int or RandomState instance, default=None
        Stored on the estimator as ``random_state``.

    Attributes
    ----------
    support_ : ndarray of shape (n_features,)
        Mask of the selected interval.

    score_ : float
        Cross validation score of the interval selected.

    best_model_ : object
        Model returned by the evaluation of the selected interval.

    References
    ----------
    .. [1] L. Nørgaard, A. Saudland, J. Wagner, J. P. Nielsen, L. Munck, S. B. Engelsen,
           'Interval Partial Least-Squares Regression (iPLS):
           A Comparative Chemometric Study with an Example from Near-Infrared Spectroscopy'
           Applied Spectroscopy, Volume 54, Nr. 3, 413--419, 2000.

    Examples
    --------
    >>> import numpy as np
    >>> from auswahl import IPLS
    >>> X = np.random.randn(100, 10)
    >>> y = 5 * X[:, 3] - 2 * X[:, 4]  # y only depends on two features
    >>> selector = IPLS(interval_width=2)
    >>> selector.fit(X, y).get_support()
    array([False, False, False, True, True, False, False, False, False, False])
    """

    def __init__(self,
                 n_intervals_to_select: int = 1,
                 interval_width: Union[int, float] = None,
                 n_cv_folds: int = 10,
                 pls: PLSRegression = None,
                 n_jobs: int = 1,
                 model_hyperparams: Union[Dict, List[Dict]] = None,
                 random_state: Union[int, np.random.RandomState] = None):
        # The base class is always configured for exactly one interval,
        # regardless of the value the caller passed in.
        super().__init__(n_intervals_to_select=1, interval_width=interval_width,
                         model_hyperparams=model_hyperparams, n_cv_folds=n_cv_folds, n_jobs=n_jobs)
        if n_intervals_to_select != 1:
            # NOTE(review): the message states that interval_width was increased, but this
            # constructor forwards interval_width unchanged; only reparameterize() applies
            # the product. Confirm which behaviour is intended before rewording the message.
            warnings.warn("""IPLS only supports the selection of a single interval.
n_intervals_to_select has been clipped to 1 and the interval_width increased to
n_intervals_to_select * interval_width. Hence, IPLS models the special case of aranging
the selected intervals as continuum.""")

        self.pls = pls
        self.random_state = random_state

    def _fit_ipls(self, X, y, interval_width, pls, start):
        """Evaluate the interval of ``interval_width`` columns of ``X`` beginning at ``start``.

        Returns
        -------
        tuple
            ``(score, model, start)``, where ``score`` and ``model`` are whatever ``self.evaluate`` returns
            for the column subset; ``start`` is passed through so the caller can identify the candidate.
        """
        columns = np.arange(start, start + interval_width, dtype='int')
        score, model = self.evaluate(X[:, columns], y, pls)
        return score, model, start

    def _fit(self, X, y, n_intervals_to_select, interval_width):
        """Score every contiguous interval and keep the one with the highest score.

        The requested intervals are treated as one contiguous block of
        ``interval_width * n_intervals_to_select`` features.

        Raises
        ------
        ValueError
            If the combined interval width is smaller than 1 or exceeds the number of features in ``X``.
        """
        # A single width is used for the evaluated slice, the range of start
        # positions and the support mask, so all three always describe the same interval.
        total_width = interval_width * n_intervals_to_select
        n_features = X.shape[-1]
        if not 1 <= total_width <= n_features:
            # Without this guard an oversized width yields no candidates and max() fails
            # with an uninformative "empty sequence" ValueError.
            raise ValueError(f"The interval width ({total_width}) must lie between 1 and the number "
                             f"of features ({n_features}).")

        n_starts = n_features - total_width + 1
        candidates = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit_ipls)(X, y, total_width, self.pls, start) for start in range(n_starts)
        )
        # max() returns the first maximal element, so ties resolve to the lowest start index.
        score, best_model, start = max(candidates, key=lambda candidate: candidate[0])

        support = np.zeros(n_features, dtype=bool)
        support[start:start + total_width] = True
        self.support_ = support
        # Expose the winning score under the attribute name documented in the class docstring.
        self.score_ = score
        self.best_model_ = best_model

    def reparameterize(self, feature_descriptor: FeatureDescriptor):
        """Collapse the configuration given by ``feature_descriptor`` into a single interval.

        ``feature_descriptor.get_configuration_for(self)`` supplies a pair
        ``(n_intervals_to_select, interval_width)``; their product is stored as ``self.interval_width``.
        """
        n_intervals_to_select, interval_width = feature_descriptor.get_configuration_for(self)
        self.interval_width = n_intervals_to_select * interval_width