# Source code for auswahl.benchmarking.util.data_handling

import os
import pickle
from functools import wraps
from functools import partial as prtl
from typing import List, Union, Tuple

import numpy as np
import pandas as pd

from .helpers import Selection
from ..._base import FeatureDescriptor


def _identify_key_error(key, multiindex):
    """Report the first requested label that does not occur in its index level.

    Parameters
    ----------
    key : sequence
        One entry per level of ``multiindex`` (in level order). An entry is
        either a ``slice`` (skipped), a single label, or a list of labels.
    multiindex : pandas.MultiIndex
        The index the key is checked against.

    Returns
    -------
    str or None
        A message naming the first missing label and the level it was looked
        up in, or ``None`` if every requested label is present.
    """
    for depth, entry in enumerate(key):
        # slices select by position/range and cannot name a missing label
        if isinstance(entry, slice):
            continue
        candidates = entry if isinstance(entry, list) else [entry]
        for candidate in candidates:
            if np.any(multiindex.isin([candidate], level=depth)):
                continue
            # feature descriptors are reported by their original key
            shown = candidate.org_key if isinstance(candidate, FeatureDescriptor) else candidate
            return f'Item {shown} not present in level {multiindex.names[depth]}'
    return None


def _protected(func):
    """Decorate a data accessor so that missing keys always raise ``KeyError``.

    The wrapped function has to return a triple ``(result, key, multiindex)``.
    Only ``result`` is handed back to the caller; ``key`` and ``multiindex``
    are used to verify that every requested label exists in the index.

    Parameters
    ----------
    func : callable
        Method taking the instance as first argument and returning
        ``(result, key, multiindex)``.

    Returns
    -------
    callable
        Wrapper with the metadata of ``func`` that returns ``result`` only.

    Raises
    ------
    KeyError
        If ``func`` itself raises ``KeyError``, or if a label of ``key`` is
        not present in the corresponding level of ``multiindex``.
    """
    @wraps(func)
    def _wrapper(s, *args, **kwargs):
        try:
            res, key, multiindex = func(s, *args, **kwargs)
        except KeyError as e:
            # tentative measure for future pandas versions: the lookup itself
            # may raise. Forward the original arguments so the message is not
            # lost (an empty KeyError carries no diagnostic for the caller).
            raise KeyError(*e.args) from e
        # the lookup may succeed although requested labels are missing, so the
        # key is validated explicitly against the index
        key_error = _identify_key_error(key, multiindex)
        if key_error is not None:
            raise KeyError(key_error)
        return res

    return _wrapper


class DataHandler:
    """Data handling class corralling data generated by the benchmarking of different
    wavelength selection methods.

    Parameters
    ----------
    datasets: List[str]
        list of dataset identifiers to be allocated in the DataHandler
    methods: List[str]
        list of selector identifiers to be allocated in the DataHandler
    features: List[FeatureDescriptor]
        list of FeatureDescriptors to be allocated in the DataHandler
    reg_metrics: List[str]
        list of regression metrics to be allocated in the DataHandler
    stab_metrics: List[str]
        list of stability metrics to be allocated in the DataHandler
    n_runs: int
        number of evaluation run for all selectors (for every dataset and feature
        configuration) to be allocated in the DataHandler

    Attributes
    ----------
    datasets: List[str]
        sorted list of dataset identifiers contained in the DataHandler
    methods: List[str]
        sorted list of selector identifiers contained in the DataHandler
    feature_descriptors: List[FeatureDescriptor]
        sorted list of FeatureDescriptors contained in the DataHandler
    reg_metrics: List[str]
        sorted list of regression metrics contained in the DataHandler
    stab_metrics: List[str]
        sorted list of stability metrics contained in the DataHandler
    n_runs: int
        number of evaluation run for all selectors (for every dataset and feature
        configuration)
    n_datasets: int
        number of datasets contained in the DataHandler
    reg_data: pandas.DataFrame
        data frame holding regression data
    stab_data: pandas.DataFrame
        data frame holding the stability data
    measurement_data: pandas.DataFrame
        data frame holding the execution time measurement data
    selection_data: pandas.DataFrame
        data frame holding the feature selection data
    resolve_tuples: bool
        taken from the first passed FeatureDescriptor; forwarded as
        ``resolve_intervals`` when feature keys are converted to FeatureDescriptors
    meta: dict
        dataset name mapped to the meta information registered via
        :meth:`register_meta`
    """

    def __init__(self,
                 datasets: List[str],
                 methods: List[str],
                 features: List[FeatureDescriptor],
                 reg_metrics: List[str],
                 stab_metrics: List[str],
                 n_runs: int):
        """Allocate the (initially empty) result frames for all configurations."""
        self.datasets = sorted(datasets)
        self.methods = sorted(methods)
        self.feature_descriptors = sorted(features)
        self.reg_metrics = sorted(reg_metrics)
        self.stab_metrics = sorted(stab_metrics)
        self.n_runs = n_runs
        self.n_datasets = len(datasets)

        runs = list(range(n_runs))

        # set up the multiindex dataframes
        self.reg_data = self._build_indices(
            levels=[datasets, features, reg_metrics, runs],
            level_names=['dataset', 'n_features', 'regression_metric', 'run'],
            index=methods)
        self.stab_data = self._build_indices(
            levels=[datasets, features, stab_metrics],
            level_names=['dataset', 'n_features', 'stability_metric'],
            index=methods)
        self.measurement_data = self._build_indices(
            levels=[datasets, features, runs],
            level_names=['dataset', 'n_features', 'run'],
            index=methods)
        # every cell gets its own Selection instance
        self.selection_data = self._build_indices(
            levels=[datasets, features, runs],
            level_names=['dataset', 'n_features', 'run'],
            index=methods,
            initer=lambda ind, col: [[Selection() for _ in range(col.shape[0])] for _ in ind])

        # store, if interval descriptors passed to the member functions of this class
        # should be resolved
        self.resolve_tuples = features[0].resolve_tuples
        self.meta = {}

    def _build_indices(self, levels, level_names, index, initer=None):
        """Create a data frame with ``index`` as rows and the product of ``levels`` as columns.

        Parameters
        ----------
        levels: list of lists
            labels of each column level
        level_names: list of str
            names of the column levels
        index: list
            row labels
        initer: callable, default=None
            called as ``initer(index, columns)`` to produce the initial cell values.
            If None, the frame is filled with NaN.

        Returns
        -------
        pandas.DataFrame
        """
        cols = pd.MultiIndex.from_product(levels, names=level_names)
        cols, _ = cols.sortlevel(level=0)
        if initer is None:
            # np.nan instead of the np.NaN alias, which NumPy 2.0 removed
            values = np.full((len(index), cols.shape[0]), np.nan, dtype='float')
        else:
            values = initer(index, cols)
        return pd.DataFrame(values, index=index, columns=cols)

    def _feature_descriptor_conversion(self, features):
        """Wrap every entry of ``features`` into a FeatureDescriptor."""
        return [FeatureDescriptor(feature, resolve_intervals=self.resolve_tuples)
                for feature in features]

    def _make_key(self,
                  dataset: Union[str, List[str]] = None,
                  method: Union[str, List[str]] = None,
                  n_features: Union[int, List[int]] = None,
                  **kwargs):
        """Build the row key and the column key used to index the result frames.

        ``None`` selects everything on the respective level. The keyword arguments
        ``reg_metric``, ``stab_metric`` and ``sample`` each add one further column
        level (in that order), but only if they are passed at all.

        Returns
        -------
        tuple
            ``(method_key, column_key)`` where ``column_key`` is a tuple with one
            entry per addressed column level.
        """
        # row index key:
        method_key = slice(None) if method is None else method

        if n_features is not None and not isinstance(n_features, list):
            n_features = [n_features]

        # column index key:
        key = [
            slice(None) if dataset is None else dataset,
            slice(None) if n_features is None else self._feature_descriptor_conversion(n_features),
        ]

        # private sentinel: distinguishes "level not addressed" from every value a
        # caller could pass (including None and negative integers)
        not_given = object()
        for level_name in ('reg_metric', 'stab_metric', 'sample'):
            value = kwargs.pop(level_name, not_given)
            if value is not_given:
                continue
            key.append(slice(None) if value is None else value)
        return method_key, tuple(key)

    def _register_regression(self,
                             value,
                             dataset: str = None,
                             method: str = None,
                             n_features: FeatureDescriptor = None,
                             reg_metric: str = None,
                             sample: int = None):
        """Write a regression score into ``reg_data`` at the given configuration."""
        self.reg_data.loc[(method, (dataset, n_features, reg_metric, sample))] = value

    def _register_selection(self,
                            dataset: str,
                            method: str,
                            n_features: FeatureDescriptor,
                            sample: int,
                            selection: list):
        """Attach the selected features to the Selection stored at the given configuration."""
        # NOTE(review): relies on .loc returning the stored Selection object itself for
        # a fully specified key, so that setting the attribute mutates the frame content
        self.selection_data.loc[(method, (dataset, n_features, sample))].features = selection

    def register_stability(self,
                           dataset: str,
                           method: str,
                           n_features: FeatureDescriptor,
                           metric_name: str,
                           value: float):
        """Register a stability score in the DataHandler

        Parameters
        ----------
        dataset: str
            identifier of the dataset for which a stability is registered
        method: str
            identifier of the selector for which the stability is registered
        n_features: Union[int, Tuple[int, int], FeatureDescriptor]
            feature configuration for which the stability is registered
        metric_name: str
            name of the stability metric registered
        value: float
            the calculated stability
        """
        # NOTE(review): unlike _feature_descriptor_conversion, resolve_intervals is not
        # forwarded here - confirm that the FeatureDescriptor default is intended
        self.stab_data.loc[(method, (dataset, FeatureDescriptor(n_features), metric_name))] = value

    def _register_measurement(self,
                              value,
                              dataset: str = None,
                              method: str = None,
                              n_features: FeatureDescriptor = None,
                              sample: int = None):
        """Write a measurement into ``measurement_data`` at the given configuration."""
        self.measurement_data.loc[(method, (dataset, n_features, sample))] = value

    def register_meta(self, dataset_meta: List[Tuple[np.array, np.array, str, float]]):
        """Register dataset information into the DataHandler

        Parameters
        ----------
        dataset_meta: List[Tuple[np.array, np.array, str, float]]
            List of tuples specifying the spectral data of data set, its target values,
            its name and its training data ratio. A single tuple is accepted as well.
            The training data ratio is not stored.
        """
        if not isinstance(dataset_meta, list):
            dataset_meta = [dataset_meta]
        for x, y, name, _ in dataset_meta:
            self.meta[name] = {'x': x,
                               'y': y,
                               'n_samples': x.shape[0],
                               'n_features': x.shape[1]}

    def get_meta(self, dataset):
        """Provides meta information for each dataset.

        Parameters
        ----------
        dataset: str
            Name of the dataset, whose meta information is requested.

        Returns
        -------
        dict containing information about the dataset
            ``x``
                The spectral data of the dataset: np.ndarray of shape (n_samples, n_wavelengths)
            ``y``
                The target quantity of the dataset: np.ndarray of shape (n_samples, )
            ``n_samples``
                Direct access to the number of samples in the dataset
            ``n_features``
                Direct access to the number of wavelengths, that is features, in the dataset

        Raises
        ------
        KeyError
            If no meta information has been registered for ``dataset``.
        """
        return self.meta[dataset]

    @_protected
    def get_regression_data(self,
                            dataset: Union[str, List[str]] = None,
                            method: Union[str, List[str]] = None,
                            n_features: Union[int, List[int], Tuple[int], List[Tuple[int]]] = None,
                            reg_metric: Union[str, List[str]] = None,
                            sample: Union[int, List[int]] = None) -> pd.DataFrame:
        """Retrieve data related to the regression performance of feature selection methods.

        Parameters
        ----------
        dataset: str or list of str, default=None
            Dataset identifier or list of dataset identifiers. If None, all datasets
            are retrieved.
        method : str or list of str, default=None
            Method(s) to be retrieved. If None, all methods are retrieved.
        n_features : int, tuple of int, list of int or list of tuple of int, default=None
            Feature configuration for which to retrieve results. A configuration for a
            single number of features, a single interval defined as tuple
            (#intervals, interval_width) or lists of such configurations can be passed.
            If None, the runs for all feature configurations are retrieved.
        reg_metric : str or list of str, default=None
            Regression metric(s) to be retrieved. If None, all available metrics are
            retrieved.
        sample : int or list of int, default=None
            The run(s) to be retrieved. If None, all runs are retrieved.

        Returns
        -------
        pandas multiIndex DataFrame.
            The frame holds the selection methods in its index and a multiindex with
            levels {'dataset', 'n_features', 'regression_metric', 'run'} as columns,
            where 'run' refers to the individual runs for the statistical evaluation.
            The keys for level 'n_features' are of type
            :class:`~auswahl.FeatureDescriptor`.

        Raises
        ------
        KeyError
            If a requested item is not present in the data.
        """
        method_key, key = self._make_key(dataset, method, n_features,
                                         reg_metric=reg_metric, sample=sample)
        # key and columns are consumed by the _protected decorator for validation
        return self.reg_data.loc[(method_key, key)], key, self.reg_data.columns

    @_protected
    def get_selection_data(self,
                           dataset: Union[str, List[str]] = None,
                           method: Union[str, List[str]] = None,
                           n_features: Union[int, Tuple[int], List[int], List[Tuple[int]]] = None,
                           sample: Union[int, List[int]] = None) -> pd.DataFrame:
        """Retrieve the features selected by the feature selection methods.

        Parameters
        ----------
        dataset: str or list of str, default=None
            Dataset identifier or list of dataset identifiers. If None, all datasets
            are retrieved.
        method : str or list of str, default=None
            Method(s) to be retrieved. If None, all methods are retrieved.
        n_features : int, tuple of int, list of int or list of tuple of int, default=None
            Feature configuration for which to retrieve results. A configuration for a
            single number of features, a single interval defined as tuple
            (#intervals, interval_width) or lists of such configurations can be passed.
            If None, the runs for all numbers of selected features are retrieved.
        sample : int or list of int, default=None
            The run(s) for which the selected features are to be retrieved. If None,
            the selected features of all runs are retrieved.

        Returns
        -------
        pandas.MultiIndex DataFrame.
            The frame holds the methods in its index and a multiindex with levels
            {'dataset', 'n_features', 'run'} as columns, where 'run' refers to the
            individual runs for the statistical evaluation. The keys for level
            'n_features' are of type :class:`~auswahl.FeatureDescriptor`. The type of
            the data in the frame is :class:`~auswahl.benchmarking.util.helpers.Selection`.

        Raises
        ------
        KeyError
            If a requested item is not present in the data.
        """
        method_key, key = self._make_key(dataset, method, n_features, sample=sample)
        # key and columns are consumed by the _protected decorator for validation
        return self.selection_data.loc[(method_key, key)], key, self.selection_data.columns

    @_protected
    def get_stability_data(self,
                           dataset: Union[str, List[str]] = None,
                           method: Union[str, List[str]] = None,
                           n_features: Union[int, Tuple[int], List[int], List[Tuple[int]]] = None,
                           stab_metric: Union[str, List[str]] = None) -> pd.DataFrame:
        """Retrieve data related to the stability of feature selection methods.

        Parameters
        ----------
        dataset: str or list of str, default=None
            Dataset identifier or list of dataset identifiers. If None, all datasets
            are retrieved.
        method : str or list of str, default=None
            Method(s) to be retrieved. If None, all methods are retrieved.
        n_features : int, tuple of int, list of int or list of tuple of int, default=None
            Feature configuration for which to retrieve results. A configuration for a
            single number of features, a single interval defined as tuple
            (#intervals, interval_width) or lists of such configurations can be passed.
            If None, the runs for all numbers of selected features are retrieved.
        stab_metric : str or list of str, default=None
            Stability metric(s) to be retrieved. If None, all available metrics are
            retrieved.

        Returns
        -------
        pandas multiIndex DataFrame.
            The frame holds the selection methods in its index and a multiindex with
            levels {'dataset', 'n_features', 'stability_metric'} as columns. The keys
            for level 'n_features' are of type :class:`~auswahl.FeatureDescriptor`.

        Raises
        ------
        KeyError
            If a requested item is not present in the data.
        """
        method_key, key = self._make_key(dataset, method, n_features, stab_metric=stab_metric)
        # key and columns are consumed by the _protected decorator for validation
        return self.stab_data.loc[(method_key, key)], key, self.stab_data.columns

    @_protected
    def get_measurement_data(self,
                             dataset: Union[str, List[str]] = None,
                             method: Union[str, List[str]] = None,
                             n_features: Union[int, List[int], Tuple[int], List[Tuple[int]]] = None,
                             sample: Union[int, List[int]] = None):
        """Retrieve the measurement data recorded for the feature selection methods.

        Parameters
        ----------
        dataset: str or list of str, default=None
            Dataset identifier or list of dataset identifiers. If None, all datasets
            are retrieved.
        method : str or list of str, default=None
            Method(s) to be retrieved. If None, all methods are retrieved.
        n_features : int, tuple of int, list of int or list of tuple of int, default=None
            Feature configuration for which to retrieve results. A configuration for a
            single number of features, a single interval defined as tuple
            (#intervals, interval_width) or lists of such configurations can be passed.
            If None, the runs for all numbers of selected features are retrieved.
        sample : int or list of int, default=None
            The run(s) for which the measurements are to be retrieved. If None, the
            measurements of all runs are retrieved.

        Returns
        -------
        pandas multiIndex DataFrame.
            The frame holds the methods in its index and a multiindex with levels
            {'dataset', 'n_features', 'run'} as columns, where 'run' refers to the
            individual runs for the statistical evaluation. The keys for level
            'n_features' are of type :class:`~auswahl.FeatureDescriptor`.

        Raises
        ------
        KeyError
            If a requested item is not present in the data.
        """
        method_key, key = self._make_key(dataset, method, n_features, sample=sample)
        # key and columns are consumed by the _protected decorator for validation
        return self.measurement_data.loc[(method_key, key)], key, self.measurement_data.columns

    def store(self, file_path: str, file_name: str):
        """Stores the DataHandler object as pickle file.

        Parameters
        ----------
        file_path: str
            Path to the directory the file is written to.
        file_name: str
            Name of the file without extension. Everything from the first ``.``
            onwards is discarded and ``.pickle`` is appended.
        """
        if '.' in file_name:
            # NOTE(review): truncates at the first dot, so 'a.b' is stored as 'a.pickle'
            file_name = file_name.split('.')[0]
        path = os.path.join(file_path, f'{file_name}.pickle')
        with open(path, 'wb') as file:
            pickle.dump(self, file)