# Source code for skferm.smoothing.metrics

"""
Metrics for evaluating smoothing quality and curve smoothness.

This module provides functions to quantify:
1. How smooth a curve is (total variation metric)
2. How well the smoothed curve fits the original data (RMSE and R²)
"""

from typing import Dict, Optional, Union

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score


def total_variation(y_values: np.ndarray, normalize: bool = True) -> float:
    """
    Calculate total variation for a sequence of values.

    Total variation is the sum of the absolute differences between
    consecutive values, computed after NaN entries have been dropped.

    Parameters:
    -----------
    y_values : np.ndarray
        Array of y-values. Any numeric array-like (list, integer array, ...)
        is accepted; it is converted to a float array before processing.
    normalize : bool
        Whether to normalize by the range of values

    Returns:
    --------
    float
        Total variation metric. NaN if fewer than two non-NaN values are
        available; 0.0 if ``normalize`` is True and all values are equal.
    """
    # Convert once to float: this makes list input indexable with a boolean
    # mask and prevents unsigned-integer wrap-around inside np.diff.
    values = np.asarray(y_values, dtype=float)

    if len(values) < 2:
        return np.nan

    # Remove NaN values
    clean_values = values[~np.isnan(values)]

    if len(clean_values) < 2:
        return np.nan

    # Calculate total variation
    tv = float(np.sum(np.abs(np.diff(clean_values))))

    if normalize:
        y_range = float(np.ptp(clean_values))
        if y_range == 0:
            # A constant sequence has no variation; also avoids dividing by zero.
            return 0.0
        tv = tv / y_range

    return tv
def fit_quality_metrics(original: np.ndarray, smoothed: np.ndarray) -> Dict[str, float]:
    """
    Calculate fit quality metrics between original and smoothed data.

    Positions where either input is NaN are excluded before the metrics are
    computed.

    Parameters:
    -----------
    original : np.ndarray
        Original data values. Any numeric array-like is accepted; ``None``
        entries are treated as NaN.
    smoothed : np.ndarray
        Smoothed data values, same length as ``original``

    Returns:
    --------
    Dict[str, float]
        Dictionary with 'rmse' and 'r2' keys. Both values are NaN when fewer
        than two positions are valid in both inputs.

    Raises:
    -------
    ValueError
        If the two inputs do not have the same length.
    """
    # Coerce to float arrays so np.isnan and boolean-mask indexing work for
    # lists and object-dtype input (e.g. values containing None).
    original_arr = np.asarray(original, dtype=float)
    smoothed_arr = np.asarray(smoothed, dtype=float)

    if len(original_arr) != len(smoothed_arr):
        raise ValueError("Original and smoothed arrays must have the same length")

    # Remove NaN values: keep only positions valid in both arrays
    mask = ~(np.isnan(original_arr) | np.isnan(smoothed_arr))
    if np.sum(mask) < 2:
        return {"rmse": np.nan, "r2": np.nan}

    original_clean = original_arr[mask]
    smoothed_clean = smoothed_arr[mask]

    rmse = float(np.sqrt(mean_squared_error(original_clean, smoothed_clean)))
    r2 = float(r2_score(original_clean, smoothed_clean))

    return {"rmse": rmse, "r2": r2}
def evaluate_smoothing_quality(
    df: pd.DataFrame,
    x_col: str,
    original_col: str,
    smoothed_col: str,
    group_col: Optional[str] = None,
) -> Union[pd.DataFrame, pd.Series]:
    """
    Evaluation of smoothing quality.

    For the whole frame, or for each group, rows are sorted by ``x_col`` and
    four metrics are computed: the total variation of the original and of the
    smoothed column, and the RMSE and R² between the two.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing the data
    x_col : str
        Column name for x-axis (for sorting)
    original_col : str
        Column name for original data
    smoothed_col : str
        Column name for smoothed data
    group_col : Optional[str]
        Column to group by (returns a DataFrame with one row per group if
        provided)

    Returns:
    --------
    pd.DataFrame or pd.Series
        DataFrame with metrics (one row per group, group key as a column) if
        group_col is provided, else a Series of metrics for the whole frame

    Raises:
    -------
    ValueError
        If any of the named columns is missing from ``df``.
    """
    # Validate inputs
    value_cols = [x_col, original_col, smoothed_col]
    required_cols = list(value_cols)
    # Use the same "is not None" test as the dispatch below so that a falsy
    # but explicit group_col (e.g. "") is validated instead of failing later
    # inside groupby.
    if group_col is not None:
        required_cols.append(group_col)

    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns: {missing_cols}")

    def _calculate_group_metrics(group_df: pd.DataFrame) -> pd.Series:
        """Calculate metrics for a single group."""
        sorted_df = group_df.sort_values(x_col)

        original_vals = np.asarray(sorted_df[original_col].values)
        smoothed_vals = np.asarray(sorted_df[smoothed_col].values)

        # Calculate smoothness metrics
        orig_smoothness = total_variation(original_vals)
        smooth_smoothness = total_variation(smoothed_vals)

        # Calculate fit quality
        fit_metrics = fit_quality_metrics(original_vals, smoothed_vals)

        metrics = {
            f"{original_col}_smoothness": orig_smoothness,
            f"{smoothed_col}_smoothness": smooth_smoothness,
            f"{original_col}_{smoothed_col}_rmse_fit": fit_metrics["rmse"],
            f"{original_col}_{smoothed_col}_r2_fit": fit_metrics["r2"],
        }

        return pd.Series(metrics)

    if group_col is None:
        return _calculate_group_metrics(df)

    grouped = df.groupby(group_col)
    # Only the value columns are needed per group. Selecting them explicitly
    # keeps apply() from operating on the grouping column, which newer pandas
    # versions deprecate. Duplicated names are collapsed so that column
    # lookups inside the helper still return a single column.
    selected_cols = list(dict.fromkeys(value_cols))
    if group_col not in selected_cols:
        grouped = grouped[selected_cols]
    return grouped.apply(_calculate_group_metrics).reset_index()