import warnings
from typing import List, Union, Literal, Tuple
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from .data_handling import DataHandler
from ..._base import FeatureDescriptor
# go
def _check_specified_or_singleton(pool, argument, identifier):
if argument is None:
if len(pool) > 1:
raise ValueError(f'{identifier} is ambiguous. Specify a {identifier}.')
elif len(pool) == 0:
raise ValueError(f'No {identifier} specified during configuration of the benchmarking.')
return pool[0]
elif isinstance(argument, list):
if len(argument) == 1:
return argument[0]
raise ValueError(f'A single specifier is required for {identifier}')
return argument
def _check_n_features(pod: DataHandler, n_features):
if n_features is not None:
if n_features is not isinstance(n_features, list):
n_features = [n_features]
n_features = [FeatureDescriptor(feature, pod.resolve_tuples) for feature in n_features]
else:
n_features = pod.feature_descriptors
return n_features
# go
def _arrange_boxes(pod, n_features, methods):
x_coords = []
ticks = np.arange(len(n_features) if n_features is not None else len(pod.feature_descriptors)) + 1 # start with 1
n_methods = len(methods if methods is not None else pod.methods)
if len(methods) > 1:
for i in range(n_methods):
x_coords.append((-0.15 + ticks + (0.3 / (n_methods - 1)) * i).tolist())
else:
x_coords = [ticks]
return x_coords, ticks
# go
def _box_plot(title: str,
x_label: str,
y_label: str,
y_data: List[List[float]],
x_data: List[float],
legend: List[str],
tick_labels: List[List[Union[float, int]]] = None,
ticks: List[Union[int, float]] = None,
save_path: str = None):
colors = plt.cm.get_cmap('Accent', len(y_data) + 1)
entities = ['boxprops', 'flierprops', 'capprops', 'whiskerprops']
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
legend_handles = []
# adapt box width to the offsetting and the number of methods if a custom ticking is used
if tick_labels is not None:
box_width = (0.3 * 0.9) / len(y_data)
else:
pos = [data[0] for data in x_data]
box_width = (max(pos) - min(pos)) / 20
if legend is None:
plotting_kwargs = dict()
for entity in entities:
plotting_kwargs[entity] = dict(color='k')
box_subplots = []
for i, data in enumerate(y_data):
if legend is not None:
plotting_kwargs = dict()
for entity in entities:
plotting_kwargs[entity] = dict(color=colors(i))
box_subplots.append(ax.boxplot(data, positions=x_data[i], whis=(0, 100), widths=box_width,
manage_ticks=False, patch_artist=True, **plotting_kwargs))
if legend is not None:
legend_handles.append(mpatches.Patch(color=colors(i), label=legend[i]))
# fill boxes
for i, box_plot in enumerate(box_subplots):
for patch in box_plot['boxes']:
patch.set_facecolor(colors(i))
ax.set_title(title)
ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
ax.grid(axis='y')
if ticks is not None: # apply custom ticking and labelling
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
ax.set_xticklabels(tick_labels)
ax.set_xticks(ticks)
if legend is not None:
ax.legend(handles=legend_handles)
if save_path is not None:
plt.savefig(save_path)
else:
plt.show()
# go
def _errorbar_plot(title: str,
x_label: str,
y_label: str,
y_data: np.array,
y_max: np.array,
y_min: np.array,
x_data: List[List[Union[float, int]]],
tick_labels: List[List[Union[float, int]]],
ticks: List[Union[int, float]],
legend: List[str],
plot_lines: bool = True,
save_path: str = None):
colors = plt.cm.get_cmap('Accent', y_data.shape[0] + 1)
markers = [c for c in ".ov^<>12348sp*hH+xDd|"]
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
legend_handles = []
# calculate errors:
y_max = (y_max - y_data).tolist()
y_min = (y_data - y_min).tolist()
y_data = y_data.tolist()
for i, y in enumerate(y_data):
ax.errorbar(x_data[i] if len(x_data) > 1 else x_data[0],
y,
yerr=[y_min[i], y_max[i]],
color=colors(i),
marker=markers[i],
linestyle='dotted' if plot_lines else 'none')
legend_handles.append(mpatches.Patch(color=colors(i), label=legend[i]))
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
ax.set_title(title)
ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
ax.set_xticklabels(tick_labels)
ax.set_xticks(ticks)
ax.legend(handles=legend_handles)
ax.grid(axis='y')
if save_path is not None:
plt.savefig(save_path)
else:
plt.show()
def _line_plot(title: str,
x_label: str,
y_label: str,
y_data: List[List[float]],
x_data: List[Union[int, Tuple[int, int]]],
legend: List[str],
save_path: str = None):
colors = plt.cm.get_cmap('Accent', len(y_data) + 1)
markers = [c for c in ".ov^<>12348sp*hH+xDd|"]
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
legend_handles = []
positions = np.arange(len(x_data))
for i, y_data in enumerate(y_data):
ax.errorbar(positions,
y_data,
color=colors(i),
marker=markers[i])
legend_handles.append(mpatches.Patch(color=colors(i), label=legend[i]))
with warnings.catch_warnings():
warnings.filterwarnings("ignore")
ax.set_title(title)
ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
ax.set_xticks(positions)
ax.set_xticklabels(x_data)
ax.legend(handles=legend_handles)
ax.grid(axis='y')
if save_path is not None:
plt.savefig(save_path)
else:
plt.show()
# go -> confirmed
[docs]def plot_score_vs_stability(pod: DataHandler,
n_features: Union[int, Tuple[int]] = None,
dataset: str = None,
stability_metric: str = None,
regression_metric: str = None,
methods: Union[str, List[str]] = None,
save_path: str = None):
"""Plotting a boxplot for the benchmarked methods displaying
* the mean regression score on the y-axis
* mean regression value
* (25,75) IQR as box
* (0, 100) range as whiskers
* the stability score on the x-axis
Parameters
----------
pod : DataHandler
:class:`~auswahl.benchmarking.DataHandler` object containing the benchmarking data.
dataset: str
Dataset for which the data is to be plotted. If there is data for only one dataset
in the :class:`~auswahl.benchmarking.DataHandler` object, the argument does not have to be specified.
n_features: int or tuple of int
Number of features, which were to be selected by the algorithms. If there is data for only one feature configuration
in the :class:`~auswahl.benchmarking.DataHandler` object, the argument does not have to be specified.
stability_metric : str
Identifier of the stability metric to be plotted in the pod. If there is data for only one stability metric
in the :class:`~auswahl.benchmarking.DataHandler` object, the argument does not have to be specified.
regression_metric : str
Identifier of the regression metric to be plotted in the pod. If there is data for only one regression metric
in the :class:`~auswahl.benchmarking.DataHandler` object, the argument does not have to be specified.
methods : str or list of str, default=None
Identifiers of methods for which the data is to be plotted. If None, all available methods are plotted.
save_path: str, default=None
Path at which the plot is stored. If None, the plot is just displayed.
"""
dataset = _check_specified_or_singleton(pod.datasets, dataset, identifier='dataset')
n_features = _check_specified_or_singleton(pod.feature_descriptors, n_features, identifier='n_features')
regression_metric = _check_specified_or_singleton(pod.reg_metrics, regression_metric,
identifier='regression metric')
stability_metric = _check_specified_or_singleton(pod.stab_metrics, stability_metric, identifier='stability metric')
reg_data = pod.get_regression_data(dataset=dataset,
method=methods,
n_features=n_features,
reg_metric=regression_metric).to_numpy().tolist()
stab_data = pod.get_stability_data(dataset=dataset,
method=methods,
n_features=n_features,
stab_metric=stability_metric).to_numpy().tolist()
_box_plot(f'Regression-Stability-Plot for {FeatureDescriptor(n_features)} features',
stability_metric,
regression_metric,
reg_data,
stab_data,
pod.methods,
save_path=save_path)
# go -> confirmed
[docs]def plot_exec_time(pod: DataHandler,
dataset: str = None,
methods: Union[str, List[str]] = None,
n_features: List[Union[int, Tuple[int]]] = None,
item: Literal['mean', 'median'] = 'mean',
save_path: str = None):
"""Plots execution times of selectors across different number of features to be selected.
Parameters
----------
pod: DataHandler
:class:`~auswahl.benchmarking.DataHandler` object containing the benchmarking data.
dataset: str, default=None
Identifier of the dataset of which to plot the execution time. If there is data for only one dataset
in the BenchmarkPOD object, the argument does not have to be specified.
methods: str or list of str, default=None
Identifiers of methods for which to plot the execution time. If None, all available methods are used.
n_features: list of integers or of tuples of integers, default=None
Identifiers of the number of features or the configuration of intervals for which the execution time is to be
plotted. If None, all available feature descriptors are used.
item: Literal['mean', 'median'], default='mean'
Specifies whether the mean or median is displayed in the plot.
save_path: str
Path at which the plot has to be saved.
"""
dataset = _check_specified_or_singleton(pod.datasets, dataset, identifier='dataset')
n_features = _check_n_features(pod, n_features)
if methods is None:
methods = pod.methods
exec_times = pod.get_measurement_data(dataset=dataset,
method=methods,
n_features=n_features)
grouped = exec_times.groupby(axis=1, level=['dataset', 'n_features'])
exec_mins = grouped.min().to_numpy()
exec_max = grouped.max().to_numpy()
if item == 'mean':
exec_times = grouped.nanmean().to_numpy()
elif item == 'median':
exec_times = grouped.nanmedian().to_numpy()
else:
raise ValueError("f'Unknown item {item}. Use median or mean'")
x_coords, ticks = _arrange_boxes(pod, n_features, methods)
_errorbar_plot(f'Execution time: {item} and ranges',
"n_features",
"Execution time [s]",
exec_times,
exec_max,
exec_mins,
x_coords,
list(map(str, n_features)),
ticks,
methods if methods is not None else pod.methods,
plot_lines=False,
save_path=save_path)
# go -> confirmed
def _plot_score_box(pod: DataHandler,
dataset: str,
regression_metric,
methods: Union[str, List[str]],
n_features: List[Union[int, Tuple[int, int]]],
save_path: str = None):
regression_metric = _check_specified_or_singleton(pod.reg_metrics, regression_metric,
identifier='regression metric')
dataset = _check_specified_or_singleton(pod.datasets, dataset, identifier='dataset')
n_features = _check_n_features(pod, n_features)
if methods is None:
methods = pod.methods
reg_scores = pod.get_regression_data(method=methods,
n_features=n_features,
dataset=dataset,
reg_metric=regression_metric).to_numpy()
reg_scores = np.reshape(reg_scores, (len(methods), -1, pod.n_runs)).tolist() # shape: methods x n_features x n_runs
# calculate offset x coordinates
x_coords, ticks = _arrange_boxes(pod, n_features, methods)
_box_plot(f'Regression performance box plot on dataset {dataset}',
"n_features",
regression_metric,
reg_scores,
x_coords,
methods if methods is not None else pod.methods,
list(map(str, n_features)),
ticks,
save_path=save_path)
# TODO: adapt
def _plot_score_bar(pod: DataHandler,
dataset: str = None,
regression_metric: str = None,
methods: Union[str, List[str]] = None,
n_features: List[Union[int, Tuple[int, int]]] = None,
item: Literal['mean', 'median'] = 'mean',
save_path: str = None):
regression_metric = _check_specified_or_singleton(pod.reg_metrics, regression_metric,
identifier='regression metric')
dataset = _check_specified_or_singleton(pod.datasets, dataset, identifier='dataset')
if n_features is not None and not isinstance(n_features, list):
n_features = [n_features]
reg_scores = pod.get_regression_data(method=methods,
n_features=n_features,
dataset=dataset,
reg_metric=regression_metric,
item=item).to_numpy()
reg_mins = pod.get_regression_data(method=methods,
dataset=dataset,
n_features=n_features,
reg_metric=regression_metric,
item='min').to_numpy()
reg_max = pod.get_regression_data(method=methods,
dataset=dataset,
n_features=n_features,
reg_metric=regression_metric,
item='max').to_numpy()
# calculate offset x coordinates
x_coords, ticks = _arrange_boxes(pod, n_features, methods)
_errorbar_plot(f'Regression performance: {item} and range on dataset {dataset}',
"n_features",
regression_metric,
reg_scores,
reg_max,
reg_mins,
x_coords,
n_features if n_features is not None else pod.n_features,
ticks,
methods if methods is not None else pod.methods,
plot_lines=False,
save_path=save_path
)
# go -> confirmed
[docs]def plot_score(pod: DataHandler,
dataset: str = None,
regression_metric: str = None,
methods: Union[str, List[str]] = None,
n_features: List[Union[int, Tuple[int, int]]] = None,
item: Literal['mean', 'median'] = 'mean',
plot_type: Literal['box', 'bar'] = 'box',
save_path: str = None):
"""Plot regression scores of selectors across different number of selected features as box or bar plot.
Parameters
----------
pod: DataHandler
:class:`~auswahl.benchmarking.DataHandler` object containing the benchmarking data.
dataset: str, default=None
Identifier of the dataset of which to plot the execution time. If there is data for only one dataset
in the :class:`~auswahl.benchmarking.DataHandler` object, the argument does not have to be specified.
methods: str or list of str, default=None
Identifiers of methods for which to plot the execution time. If None, all available methods are used.
n_features: list of integers or of tuples of integers, default=None
Identifiers of the number of features or the configuration of intervals for which the execution time is to be
plotted. If None, all available feature descriptors are used.
item: Literal['mean', 'median'], default='mean'
Specifies whether the mean or median is displayed in the plot.
plot_type: Literal['box', 'bar'], default='box'
Specifies the requested plot type.
save_path: str, default=None
Path at which the plot has to be saved. If None, the plot is only displayed, not saved.
"""
if plot_type == 'box':
_plot_score_box(pod, dataset, regression_metric, methods, n_features, save_path)
elif plot_type == 'bar':
_plot_score_bar(pod, dataset, regression_metric, methods, n_features, item, save_path)
else:
raise ValueError(f'Unknown plot type {plot_type}.')
# go -> confirmed
[docs]def plot_stability(pod: DataHandler,
dataset: str = None,
stability_metric: str = None,
methods: Union[str, List[str]] = None,
save_path: str = None):
"""Plots the stability score of methods for a given metric.
Parameters
----------
pod: DataHandler
:class:`~auswahl.benchmarking.DataHandler` object containing the benchmarking data.
dataset: str, default=None
Identifier of the dataset of which to plot the execution time. If there is data for only one dataset
in the :class:`~auswahl.benchmarking.DataHandler` object, the argument does not have to be specified.
stability_metric: str, default=None
Stability metric used for plotting. If there is data for only one stability_metric
in the :class:`~auswahl.benchmarking.DataHandler` object, the argument does not have to be specified.
methods: Union[str, List[str]], default=None
Method identifier or list of method identifiers for methods to be plotted. If None, all available methods
are plotted.
save_path: str, default=None
Path on which to store the plot. If None, the plot is simply displayed.
"""
dataset = _check_specified_or_singleton(pod.datasets, dataset, identifier='dataset')
stability_metric = _check_specified_or_singleton(pod.stab_metrics, stability_metric, identifier='stability metric')
y_data = pod.get_stability_data(method=methods, dataset=dataset, stab_metric=stability_metric).to_numpy().tolist()
x_data = list(map(str, pod.feature_descriptors))
_line_plot(f'Stability across n_features to select: Dataset {dataset}',
"n_features",
stability_metric,
y_data,
x_data,
pod.methods,
save_path)
# go
def _plot_selection_bar(pod: DataHandler,
dataset: str,
n_features: Union[int, Tuple[int]],
methods: Union[str, List[str]] = None,
save_path: str = None):
colors = plt.cm.get_cmap('Accent', len(methods) + 1)
fig = plt.figure()
gs = fig.add_gridspec(len(methods), hspace=0)
axs = gs.subplots(sharex=True, sharey=True)
fig.suptitle(f'Selection probability P on dataset {dataset} '
f'for {FeatureDescriptor(n_features, pod.resolve_tuples)} features.')
selections = pod.get_selection_data(dataset=dataset, method=methods, n_features=n_features).to_numpy().tolist()
selections = pd.DataFrame([sum([s.features for s in sel if s.is_valid()], []) for sel in selections])
n_wavelengths = pod.get_meta(dataset)['n_features']
if len(methods) == 1:
axs = [axs]
for i in range(len(methods)):
unique_counts = selections.iloc[i].value_counts()
bar_heights = np.zeros((n_wavelengths,))
bar_heights[unique_counts.index.to_numpy().astype('int')] = unique_counts.to_numpy()
axs[i].bar(np.arange(n_wavelengths), bar_heights / pod.n_runs, color=colors(i))
if i % 2 == 0: # distribute y-axis ticks between left and right-hand side
axs[i].yaxis.tick_right()
else:
axs[i].yaxis.set_label_position("right")
axs[i].set_xlabel("wavelength")
axs[i].set_ylabel("P")
# plot vertical lines
# axs[i].hlines(y=[0.25, 0.5, 0.75], xmin=axs[i].get_xlim()[0], xmax=axs[i].get_xlim()[1],
# linewidth=0.5, color='k', alpha=.5)
# axs[i].grid(axis='y')
axs[i].set_yticks([0.25, 0.5, 0.75])
axs[i].yaxis.grid(True)
axs[i].legend(handles=[mpatches.Patch(color=colors(i), label=methods[i])])
# Hide x labels and tick labels for all but the bottom plot.
for ax in axs:
ax.label_outer()
if save_path is not None:
plt.savefig(save_path)
else:
plt.show()
# go -> confirmed
[docs]def plot_selection(pod: DataHandler,
n_features: Union[int, Tuple[int, int]],
dataset: str = None,
methods: Union[str, List[str]] = None,
plot_type: Literal['heatmap', 'bar'] = 'bar',
save_path: str = None):
"""Plots the selection probability for features of different selectors.
Parameters
----------
pod: DataHandler
:class:`~auswahl.benchmarking.DataHandler` object containing the benchmarking data.
n_features: Union[int, Tuple[int, int]]
feature configuration for which to plot the selection probability
dataset: str, default=None
Identifier of the dataset of which to plot the execution time. If there is data for only one dataset
in the :class:`~auswahl.benchmarking.DataHandler` object, the argument does not have to be specified.
methods: Union[str, List[str]], default=None
Method identifier or list of method identifiers for methods to be plotted. If None, all available methods
are plotted.
plot_type: Literal['heatmap', 'bar'], default='bar'
Plot type displayed. Option 'heatmap' is currently unavailable.
save_path: str, default=None
Path on which to store the plot. If None, the plot is simply displayed.
"""
dataset = _check_specified_or_singleton(pod.datasets, dataset, identifier='dataset')
n_features = _check_specified_or_singleton(pod.feature_descriptors, n_features, identifier='n_features')
if methods is None:
methods = pod.methods
if plot_type == 'bar':
_plot_selection_bar(pod, dataset, n_features, methods, save_path)
elif plot_type == 'heatmap':
#_plot_selection_heatmap(pod, dataset, n_features, methods, save_path)
raise ValueError("The plotting option 'heatmap' is currently not available. Use the option"
"'bar' instead.")
else:
raise ValueError(f'Unknown plot_type passed to function plot_selection: {plot_type}.'
'Use one of {heatmap, bar}')