# Source code for skferm.smoothing.metrics

"""
Metrics for evaluating smoothing quality and curve smoothness.

This module provides functions to quantify:
1. How smooth a curve is (total variation metric)
2. How well the smoothed curve fits the original data (RMSE and R²)
"""

from typing import Dict, Optional, Union

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score



# [docs]
def total_variation(y_values: np.ndarray, normalize: bool = True) -> float:
    """
    Calculate total variation for a sequence of values.

    Total variation is the sum of absolute differences between consecutive
    values. NaN entries are dropped before differencing, so the values on
    either side of a NaN gap are treated as neighbours.

    Parameters:
    -----------
    y_values : np.ndarray
        Array of y-values. Any array-like that converts to a float array
        (for example a plain list) is accepted.
    normalize : bool
        Whether to divide by the range (max - min) of the non-NaN values

    Returns:
    --------
    float
        Total variation metric. NaN when fewer than two non-NaN values are
        available; 0.0 when ``normalize`` is True and all values are equal.
    """
    # Coerce once so lists and integer/object inputs behave like float
    # arrays in the NaN test and the boolean-mask indexing below.
    values = np.asarray(y_values, dtype=float)
    if len(values) < 2:
        return float("nan")

    # Remove NaN values
    clean_values = values[~np.isnan(values)]
    if len(clean_values) < 2:
        return float("nan")

    # Calculate total variation
    tv = float(np.sum(np.abs(np.diff(clean_values))))

    if normalize:
        y_range = float(np.ptp(clean_values))
        # A constant sequence has zero range; report zero variation rather
        # than dividing by zero.
        if y_range == 0:
            return 0.0
        tv = tv / y_range

    return tv




# [docs]
def fit_quality_metrics(original: np.ndarray, smoothed: np.ndarray) -> Dict[str, float]:
    """
    Calculate fit quality metrics between original and smoothed data.

    Positions where either input is NaN are excluded pairwise before the
    metrics are computed.

    Parameters:
    -----------
    original : np.ndarray
        Original data values. Any array-like that converts to a float array
        is accepted.
    smoothed : np.ndarray
        Smoothed data values, same length as ``original``

    Returns:
    --------
    Dict[str, float]
        Dictionary with 'rmse' and 'r2' keys. Both values are NaN when fewer
        than two positions are non-NaN in both inputs.

    Raises:
    -------
    ValueError
        If the two inputs do not have the same length
    """
    # Coerce once so lists and object-dtype inputs (e.g. with None as the
    # missing marker) work with np.isnan and boolean-mask indexing.
    original_arr = np.asarray(original, dtype=float)
    smoothed_arr = np.asarray(smoothed, dtype=float)

    if len(original_arr) != len(smoothed_arr):
        raise ValueError("Original and smoothed arrays must have the same length")

    # Keep only positions that are valid in both arrays
    valid = ~(np.isnan(original_arr) | np.isnan(smoothed_arr))
    if np.count_nonzero(valid) < 2:
        return {"rmse": float("nan"), "r2": float("nan")}

    original_clean = original_arr[valid]
    smoothed_clean = smoothed_arr[valid]

    rmse = float(np.sqrt(mean_squared_error(original_clean, smoothed_clean)))
    r2 = float(r2_score(original_clean, smoothed_clean))

    return {"rmse": rmse, "r2": r2}




# [docs]
def evaluate_smoothing_quality(
    df: pd.DataFrame,
    x_col: str,
    original_col: str,
    smoothed_col: str,
    group_col: Optional[str] = None,
) -> Union[pd.DataFrame, pd.Series]:
    """
    Evaluate smoothing quality, for the whole frame or per group.

    Rows are sorted by ``x_col`` and four metrics are computed: the total
    variation of the original and of the smoothed column, and the RMSE and
    R² between the two.

    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing the data
    x_col : str
        Column name for x-axis (for sorting)
    original_col : str
        Column name for original data
    smoothed_col : str
        Column name for smoothed data
    group_col : Optional[str]
        Column to group by (returns a DataFrame with one row per group if
        provided)

    Returns:
    --------
    pd.DataFrame or pd.Series
        DataFrame with ``group_col`` plus one column per metric if
        ``group_col`` is provided, else a Series indexed by metric name.
        Metric names are ``{original_col}_smoothness``,
        ``{smoothed_col}_smoothness``,
        ``{original_col}_{smoothed_col}_rmse_fit`` and
        ``{original_col}_{smoothed_col}_r2_fit``.

    Raises:
    -------
    ValueError
        If any of the named columns is missing from ``df``
    """
    # Validate inputs. The same "is None" test is used here and for the
    # dispatch below so validation and grouping can never disagree.
    value_cols = [x_col, original_col, smoothed_col]
    required_cols = value_cols if group_col is None else [*value_cols, group_col]

    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns: {missing_cols}")

    def _calculate_group_metrics(group_df: pd.DataFrame) -> pd.Series:
        """Calculate metrics for a single group."""
        sorted_df = group_df.sort_values(x_col)

        original_vals = np.asarray(sorted_df[original_col].values)
        smoothed_vals = np.asarray(sorted_df[smoothed_col].values)

        # Calculate fit quality
        fit_metrics = fit_quality_metrics(original_vals, smoothed_vals)

        return pd.Series(
            {
                f"{original_col}_smoothness": total_variation(original_vals),
                f"{smoothed_col}_smoothness": total_variation(smoothed_vals),
                f"{original_col}_{smoothed_col}_rmse_fit": fit_metrics["rmse"],
                f"{original_col}_{smoothed_col}_r2_fit": fit_metrics["r2"],
            }
        )

    if group_col is None:
        return _calculate_group_metrics(df)

    # Hand the callback only the columns it reads, so apply() does not
    # operate on the grouping column (deprecated in recent pandas).
    # dict.fromkeys drops repeats while keeping order, in case the same
    # column is named for more than one role.
    selected_cols = list(dict.fromkeys(value_cols))
    grouped = df.groupby(group_col)[selected_cols]
    return grouped.apply(_calculate_group_metrics).reset_index()