# Source code for auswahl._spa

from typing import Union, List, Dict

import numpy as np
from joblib import Parallel, delayed
from sklearn.cross_decomposition import PLSRegression
from sklearn.utils.validation import check_is_fitted

from ._base import PointSelector


class SPA(PointSelector):
    """Feature selection with the Successive Projection Algorithm (SPA).

    The Successive Projections Algorithm conducts feature selection according
    to Araújo et al. [1]_. The algorithm aims to find a set of features
    exhibiting minimal collinearity.

    Read more in the :ref:`User Guide <spa>`.

    Parameters
    ----------
    n_features_to_select : int, default=None
        Upper bound of features to select.

    n_cv_folds : int, default=5
        Number of cross validation folds used in the evaluation of feature
        sets.

    pls : PLSRegression, default=None
        Estimator instance of the
        :py:class:`PLSRegression <sklearn.cross_decomposition.PLSRegression>`
        class. Use this to adjust the hyperparameters of the PLS method.

    n_jobs : int, default=1
        Number of jobs used for parallel calculation of SPA.

    model_hyperparams : dict or list of dict, default=None
        Passed unchanged to the ``PointSelector`` constructor.

    Attributes
    ----------
    support_ : ndarray of shape (n_features,)
        Mask of selected features.

    best_model_ : object
        Model returned by ``evaluate`` for the best-scoring feature set.

    References
    ----------
    .. [1] Mário César Ugulino Araújo, Teresa Cristina Bezerra Saldanha,
           Roberto Kawakami Harrop Galvao, Takashi Yoneyama,
           Henrique Caldas Chame and Valeria Visani,
           The successive projections algorithm for variable selection in
           spectroscopic multicomponent analysis,
           Chemometrics and Intelligent Laboratory Systems, 57, 65-73, 2001

    Examples
    --------
    >>> import numpy as np
    >>> from auswahl import SPA
    >>> np.random.seed(1337)
    >>> X = np.random.randn(1000, 10)
    >>> y = 5 * X[:, 0] - 2 * X[:, 5]  # y only depends on two features
    >>> selector = SPA(n_features_to_select=2)
    >>> selector.fit(X, y).get_support()
    array([ True, False, False, False, False,  True, False, False, False,
           False])
    """

    def __init__(self,
                 n_features_to_select: int = None,
                 n_cv_folds: int = 5,
                 pls: PLSRegression = None,
                 n_jobs: int = 1,
                 model_hyperparams: Union[Dict, List[Dict]] = None):
        super().__init__(n_features_to_select,
                         model_hyperparams,
                         n_cv_folds,
                         n_jobs=n_jobs)
        self.pls = pls

    def _fit_spa(self, X, y, n_features_to_select, pls, seed):
        """Run one SPA chain starting at column ``seed`` and score the result.

        Parameters
        ----------
        X : ndarray
            Two-dimensional data matrix; columns are the candidate features.
        y : array-like
            Target values, forwarded to ``self.evaluate``.
        n_features_to_select : int
            Total number of columns in the chain, including the seed.
        pls : PLSRegression or None
            Estimator forwarded to ``self.evaluate``.
        seed : int
            Index of the column the chain starts from.

        Returns
        -------
        score
            Score reported by ``self.evaluate`` for the selected columns.
        model
            Model reported by ``self.evaluate`` for the selected columns.
        wavelengths : list of int
            Column indices of ``X`` in the order they were selected.
        """
        wavelengths = [seed]
        current = X[:, seed:seed + 1]
        rest = np.delete(X, seed, axis=1)
        # wavelength_map[k] is the column of X that rest[:, k] came from.
        wavelength_map = np.delete(np.arange(X.shape[1]), seed)

        for _ in range(n_features_to_select - 1):
            # For an (n, 1) column the matrix 2-norm is its Euclidean length.
            norm = np.linalg.norm(current, ord=2)
            if norm == 0:
                # A zero column spans nothing, so projecting onto its
                # orthogonal complement leaves the candidates unchanged.
                # Dividing by the zero norm would instead turn every
                # remaining column into NaN.
                projections = rest
            else:
                current = current / norm
                # Remove the component along `current` from every candidate.
                projections = rest - current @ np.transpose(
                    np.transpose(rest) @ current)

            # The candidate with the largest residual is the least collinear
            # with the columns selected so far.
            projection_distances = np.linalg.norm(projections, ord=2, axis=0)
            next_index = np.argmax(projection_distances)

            current = projections[:, next_index:next_index + 1]
            rest = np.delete(projections, next_index, axis=1)
            wavelengths.append(wavelength_map[next_index])
            wavelength_map = np.delete(wavelength_map, next_index)

        # Use the estimator handed in by the caller rather than silently
        # ignoring the argument in favour of self.pls.
        score, model = self.evaluate(X[:, wavelengths], y, pls)
        return score, model, wavelengths

    def _fit(self, X, y, n_features_to_select):
        """Run SPA from every column as seed and keep the best-scoring set.

        Sets ``support_`` to the boolean mask of the winning columns and
        ``best_model_`` to the model evaluated on them.
        """
        n_features = X.shape[1]
        candidates = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit_spa)(X, y, n_features_to_select, self.pls, seed)
            for seed in range(n_features)
        )

        # Each candidate is (score, model, wavelengths); highest score wins.
        _, model, opt_set = max(candidates,
                                key=lambda candidate: candidate[0])

        self.support_ = np.zeros(n_features, dtype=bool)
        self.support_[opt_set] = True
        self.best_model_ = model