Source code for auswahl.benchmarking.util.metrics

import numpy as np

from auswahl import FeatureDescriptor
from .data_handling import DataHandler
from abc import ABCMeta, abstractmethod


class StabilityScore(metaclass=ABCMeta):
    """Base class for all stability scores usable by the benchmarking system.

    Parameters
    ----------
    metric_name: str, default=None
        Unique name of the metric. If no name is provided, the name of the
        concrete class inheriting from this class is used.
    """

    def __init__(self, metric_name: str = None):
        # Fall back to the name of the concrete subclass when no explicit
        # name is given, so that every metric carries a label.
        if metric_name is not None:
            self.metric_name = metric_name
        else:
            self.metric_name = type(self).__name__

    def __call__(self, pod: "DataHandler"):
        """Shorthand for :meth:`add_stabilities`; returns nothing."""
        self.add_stabilities(pod)

    def add_stabilities(self, pod: "DataHandler"):
        """Conducts the evaluation of the stability metric across all datasets and methods in the
        :class:`~auswahl.benchmarking.DataHandler` object, which is extended with the results
        of the stability evaluation.

        For every combination of feature descriptor, method and dataset, the features of all
        valid selections are passed to :meth:`evaluate_stability`. A result of ``None`` is
        skipped; any other result is registered on ``pod`` under ``self.metric_name``.

        Parameters
        ----------
        pod: DataHandler
            instance of :class:`~auswahl.benchmarking.DataHandler` containing the results of
            the benchmarking procedure
        """
        for n in pod.feature_descriptors:  # FeatureDescriptor
            for method in pod.methods:
                for dataset in pod.datasets:
                    # retrieve the samples of selected features (list of objects of type Selection)
                    selections = pod.get_selection_data(method=method,
                                                        n_features=n,
                                                        dataset=dataset).to_numpy().tolist()
                    # only executions flagged as valid contribute to the stability
                    supports = np.array([selection.features
                                         for selection in selections
                                         if selection.is_valid()])
                    stability = self.evaluate_stability(pod.get_meta(dataset), supports, n)
                    if stability is not None:
                        pod.register_stability(method=method,
                                               n_features=n,
                                               dataset=dataset,
                                               metric_name=self.metric_name,
                                               value=stability)

    @abstractmethod
    def evaluate_stability(self, meta_data: dict, selections: np.ndarray,
                           features: "FeatureDescriptor") -> float:
        """Conducts the stability evaluation of a set of executions of a selector algorithm on one
        dataset with a specific feature configuration under different data splits and seeds.

        Parameters
        ----------
        meta_data: dict
            information about the data set, which might be relevant for stability calculations.
            See :meth:`~auswahl.benchmarking.DataHandler.get_meta` for the contained data
        selections: np.ndarray
            The selected features of the different executions of the selector algorithm as
            integer indices of features. Shape (#executions, #features to select)
        features: FeatureDescriptor
            FeatureDescriptor describing the configuration of features to be selected

        Returns
        -------
        stability: float
            A return value of ``None`` is not registered by :meth:`add_stabilities`.
        """
        ...
class PairwiseStabilityScore(StabilityScore, metaclass=ABCMeta):
    """The class provides the infrastructure for the introduction of new symmetric and
    pairwise defined stability metrics.

    Subclasses implement :meth:`pairwise_sim_func`; the stability is the mean of that
    function over all unordered pairs of distinct executions.
    """

    def _pairwise_scoring(self, meta_data: dict, selections: np.ndarray, features: FeatureDescriptor):
        """The function handles the calculation of a pairwise stability assessment function
        across all executions of a selector for a specific dataset and feature configuration.

        Parameters
        ----------
        meta_data: dict
            information about the data set, which might be relevant for stability calculations.
            See :meth:`auswahl.DataHandler.get_meta` for the contained data
        selections: np.ndarray
            The selected features of the different executions of the selector algorithm as
            integer indices of features. Shape (#executions, #features to select)
        features: FeatureDescriptor
            FeatureDescriptor describing the configuration of features to be selected
            (not used by the pairwise averaging itself)

        Returns
        -------
        stability: float
            Mean pairwise similarity, or ``None`` if fewer than two executions are given.
        """
        # Strict upper triangle (k=1): every unordered pair of distinct executions
        # exactly once (symmetry assumed), without the self-pairs on the diagonal.
        rows, cols = np.triu_indices(selections.shape[0], k=1)
        if rows.size == 0:
            return None

        pairwise_sim = [self.pairwise_sim_func(meta_data,
                                               set_1=selections[i],
                                               set_2=selections[j])
                        for i, j in zip(rows, cols)]
        return np.mean(np.array(pairwise_sim))

    def evaluate_stability(self, meta_data: dict, selections: np.ndarray, features: FeatureDescriptor):
        """Evaluate the stability as the mean pairwise similarity of the given selections.

        Delegates to :meth:`_pairwise_scoring` and returns its result unchanged.
        """
        return self._pairwise_scoring(meta_data, selections, features)

    @abstractmethod
    def pairwise_sim_func(self, meta_data: dict, set_1: np.ndarray, set_2: np.ndarray) -> float:
        """Function calculating the stability score for a single pair of selections of features.

        Parameters
        ----------
        meta_data: dict
            Dict containing meta information about the dataset for which the stability metric
            is evaluated. See the documentation of
            :meth:`~auswahl.benchmarking.DataHandler.get_meta` for the available data.
        set_1: np.ndarray
            array of integer indices of selected features of shape (n_features_to_select,)
        set_2: np.ndarray
            array of integer indices of selected features of shape (n_features_to_select,)

        Returns
        -------
        stability score for the given pair of selections: float
        """
        ...
class DengScore(PairwiseStabilityScore):
    """Selection stability score for randomized selection methods, according to
    Deng et al. [1]_.

    A detailed overview is provided in the user guide.

    Parameters
    ----------
    metric_name: str, default="deng_score"
        Unique Name of the metric

    References
    ----------
    .. [1] Bai-Chuan Deng, Yong-Huan Yun, Pan Ma, Chen-Chen Li, Da-Bing Ren and Yi-Zeng Liang,
           'A new method for wavelength interval selection that intelligently optimizes the
           locations, widths and combination of intervals',
           Analyst, 6, 1876-1885, 2015.
    """

    def __init__(self, metric_name: str = "deng_score"):
        super().__init__(metric_name)

    def pairwise_sim_func(self, meta_data: dict, set_1: np.ndarray, set_2: np.ndarray) -> float:
        """Score a single pair of selections.

        With ``n = set_1.size`` and ``e = n ** 2 / meta_data['n_features']`` the result is
        ``(|set_1 ∩ set_2| - e) / (n - e)``.

        Parameters
        ----------
        meta_data: dict
            Dataset meta information; only the entry ``'n_features'`` is read.
        set_1: np.ndarray
            array of integer indices of selected features
        set_2: np.ndarray
            array of integer indices of selected features

        Returns
        -------
        stability score for the given pair of selections: float
        """
        total_features = meta_data['n_features']
        subset_size = set_1.size
        baseline = subset_size ** 2 / total_features
        shared = np.intersect1d(set_1, set_2).size
        # NOTE(review): the denominator is zero when subset_size is 0 or equals
        # total_features - presumably callers never request such configurations; confirm.
        return (shared - baseline) / (subset_size - baseline)
class ZucknickScore(PairwiseStabilityScore):
    """Wraps the calculation of the stability score according to Zucknick et al. [1]_.

    The stability score features a correlation-adjusting mechanism assessing stability not only
    with respect to set theoretical stabilities, but also according to the correlation between
    the features selected in different runs. A detailed overview is provided in the user guide.

    Parameters
    ----------
    correlation_threshold: float, default=0.8
        Parameter of the calculation of stability according to Zucknick et al. [1]_ .
        The parameter determines the minimum required correlation between two features to be
        considered similar.
    metric_name: str, default="zucknick_score"
        Unique Name of the metric

    Raises
    ------
    ValueError
        If ``correlation_threshold`` is not in [0, 1].

    References
    ----------
    .. [1] Zucknick, M., Richardson, S., Stronach, E.A.: Comparing the characteristics of
           gene expression profiles derived by univariate and multivariate classification methods.
           Stat. Appl. Genet. Molecular Biol. 7(1), 7 (2008)
    """

    def __init__(self, correlation_threshold: float = 0.8, metric_name: str = "zucknick_score"):
        super().__init__(metric_name)
        if not 0 <= correlation_threshold <= 1:
            raise ValueError('Argument correlation_threshold is required to be in [0, 1]')
        self.correlation_threshold = correlation_threshold

    def _thresholded_correlation(self, spectra, support_1: np.ndarray, support_2: np.ndarray):
        """Sum the thresholded absolute correlations between the features of ``support_1`` and
        the features of ``support_2`` not contained in ``support_1``, scaled by
        ``1 / support_2.size``.

        Parameters
        ----------
        spectra:
            Data indexed as ``spectra[:, feature_indices]``, i.e. features along the second axis.
        support_1: np.ndarray
            integer indices of the selected features of the first selection
        support_2: np.ndarray
            integer indices of the selected features of the second selection

        Returns
        -------
        Scaled sum of the correlations reaching ``self.correlation_threshold``;
        ``0`` if ``support_2`` has no feature outside of ``support_1``.
        """
        set_diff = np.setdiff1d(support_2, support_1)
        if set_diff.size == 0:
            return 0

        diff_features = np.transpose(spectra[:, set_diff])  # features x observations
        sup1_features = np.transpose(spectra[:, support_1])

        # corrcoef stacks both inputs row-wise: the first support_1.size rows/columns belong
        # to support_1, the remaining ones to set_diff. Only the cross block is of interest.
        correlation = np.abs(np.corrcoef(sup1_features, diff_features))
        cross = correlation[:support_1.size, support_1.size:]

        # Entries below the threshold count as zero. np.where (instead of multiplying with the
        # boolean mask) also zeroes undefined (NaN) correlations, e.g. of zero-variance
        # features, which would otherwise turn the whole score into NaN.
        cross = np.where(cross >= self.correlation_threshold, cross, 0.0)
        return (1 / support_2.size) * np.sum(cross)

    def pairwise_sim_func(self, meta_data: dict, set_1: np.ndarray, set_2: np.ndarray) -> float:
        """Score a single pair of selections.

        The size of the intersection is extended by the thresholded correlations in both
        directions and divided by the size of the union. The union is computed as
        ``2 * set_1.size - intersection``, i.e. both selections are taken to have the same size.

        Parameters
        ----------
        meta_data: dict
            Dataset meta information; only the entry ``'x'`` (the data) is read.
        set_1: np.ndarray
            array of integer indices of selected features of shape (n_features_to_select,)
        set_2: np.ndarray
            array of integer indices of selected features of shape (n_features_to_select,)

        Returns
        -------
        stability score for the given pair of selections: float
        """
        n = set_1.size
        spectra = meta_data['x']

        intersection_size = np.intersect1d(set_1, set_2).size
        union_size = 2 * n - intersection_size

        c_12 = self._thresholded_correlation(spectra, set_1, set_2)
        c_21 = self._thresholded_correlation(spectra, set_2, set_1)

        return (intersection_size + c_12 + c_21) / union_size