Source code for lsapy.standardize

"""Standardization Functions Module."""

import operator
import warnings

import numpy as np
from scipy.optimize import curve_fit

from lsapy.core.functions import declare_equation, equations, get_function_from_name

try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None
    warnings.warn("Matplotlib not found. Plotting functionality will be disabled.", stacklevel=2)

# Public API of the module: the standardization equations plus the `fit` helper.
__all__ = [
    "boolean",
    "discrete",
    "logistic",
    "sigmoid",
    "vetharaniam2022_eq3",
    "vetharaniam2022_eq5",
    "vetharaniam2024_eq8",
    "vetharaniam2024_eq10",
    "fit",
]

# Equation families that `fit` can fit; also used to build the "<type>_like" selectors.
FIT_TYPES = ["sigmoid", "gaussian"]  # only sigmoid and gaussian need to be fitted
# Maps comparison symbols to the name of the matching function in the `operator` module
# (the name is resolved with `getattr(operator, name)` in `boolean`).
binary_ops = {">": "gt", "<": "lt", ">=": "ge", "<=": "le", "==": "eq", "!=": "ne"}



[docs]
@declare_equation("boolean")
def boolean(x, op: str, thresh: int | float, skipna: bool = True):
    """
    Compare values against a threshold.

    The comparison named by `op` is applied between `x` and `thresh`.

    Parameters
    ----------
    x : any
        Input values.
    op : {">", "gt", "<", "lt", ">=", "ge", "<=", "le", "==", "eq", "!=", "ne"}
        Comparison to apply, given either as a symbol or as the name of the
        matching function of the :mod:`operator` module.
    thresh : any
        Threshold value compared against `x`.
    skipna : bool, optional
        If True (default), positions where `x` is NaN are NaN in the output instead of
        carrying the comparison result.

    Returns
    -------
    np.ndarray
        Result of the comparison. When `skipna` is True the result is produced by
        ``np.where`` with NaN at the positions where `x` is NaN.

    Raises
    ------
    ValueError
        If `op` is neither a known symbol nor a known operator name.
    """
    # Symbols are translated to their `operator` function name; names pass through as-is.
    op_name = binary_ops.get(op, op)
    if op_name not in binary_ops.values():
        raise ValueError(f"Operator '{op}' not recognized.")

    compare = getattr(operator, op_name)
    outcome = compare(x, thresh)
    if not skipna:
        return outcome
    # Keep missing inputs missing rather than reporting them as a failed comparison.
    return np.where(np.isnan(x), np.nan, outcome)




[docs]
@declare_equation("categorical")
def discrete(x, rules: dict[str | int, int | float]) -> np.ndarray:
    """
    Discrete function.

    This function maps input values to a set of rules that define the output values.

    Parameters
    ----------
    x : any
        Input values to map.
    rules : dict[str | int, int | float]
        Rules to map the input values to output values. The keys correspond to the input values and the
        values to its associated output values.

    Returns
    -------
    np.ndarray
        Mapped output values.
    """
    return np.vectorize(rules.get)(x, np.nan)




[docs]
@declare_equation("sigmoid")
def logistic(x, a, b) -> np.ndarray:
    r"""
    Logistic function capped to 1.

    Intended for sigmoid-like suitability data. It accepts positive and negative values of `x`;
    the sign of `a` selects an increasing (`a > 0`) or decreasing (`a < 0`) curve.

    Parameters
    ----------
    x : any
        Input values.
    a : float | int
        Steepness of the curve.
    b : float | int
        Midpoint of the curve, i.e. the value of `x` at which the output is 0.5.

    Returns
    -------
    np.ndarray
        Output values.

    Notes
    -----
    The function is defined as:

    .. math::

        f(x) = \frac{1}{1 + e^{-a(x - b)}}
    """
    offset = np.subtract(x, b)  # distance of each input from the midpoint
    decay = np.exp(-a * offset)
    return 1 / (1 + decay)




[docs]
@declare_equation("sigmoid")
def sigmoid(x) -> np.ndarray:
    r"""
    Logistic sigmoid function.

    Special case of :func:`logistic` with `a=1` and `b=0`. It accepts positive and negative
    values of `x`, but only describes increasing sigmoid-like data.

    Parameters
    ----------
    x : any
        Input values.

    Returns
    -------
    np.ndarray
        Output values.

    See Also
    --------
    :func:`logistic`

    Notes
    -----
    The sigmoid function is defined as:

    .. math::

        f(x) = \frac{1}{1 + e^{-x}}
    """
    # Unit steepness and a midpoint at the origin reduce the logistic curve to the sigmoid.
    steepness, midpoint = 1, 0
    return logistic(x, steepness, midpoint)




[docs]
@declare_equation("sigmoid", "VTR22_eq3")
def vetharaniam2022_eq3(x, a, b) -> np.ndarray:
    r"""
    Sigmoid like function.

    This function is equivalent to the logistic function and thus can be used on both positive and negative values of
    `x`, as well as on increasing and decreasing sigmoid-like data.

    Parameters
    ----------
    x : any
        Input values.
    a : float | int
        Steepness of the function parameter.
    b : float | int
        Value of the function's midpoint.

    Returns
    -------
    np.ndarray
        Output values.

    See Also
    --------
    :func:`logistic`

    Notes
    -----
    Alternative name: `VTR22_eq3`.
    This function has been implemented to support reproducibility of the original paper. However, as it is equivalent to
    the more commonly used `logistic` function, it is recommended to use the `logistic` function instead.
    This function is defined as:

    .. math::

        f(x) = \frac{e^{a(x - b)}}{1 + e^{a(x - b)}}

    It is evaluated in the algebraically identical form :math:`1 / (1 + e^{-a(x - b)})`: evaluating the
    fraction literally yields ``inf / inf = NaN`` once :math:`e^{a(x - b)}` overflows.

    References
    ----------
    :cite:cts:`vetharaniam_lsa_2022`
    """
    z = a * np.subtract(x, b)
    # An overflowing exponential here means the curve has saturated at 0, which is exactly
    # what 1 / (1 + inf) evaluates to, so the overflow warning carries no information.
    with np.errstate(over="ignore"):
        return 1 / (1 + np.exp(-z))




[docs]
@declare_equation("sigmoid", "VTR22_eq5")
def vetharaniam2022_eq5(x, a, b) -> np.ndarray:
    r"""
    Sigmoid like function.

    Modified version of the logistic function working on the square roots of `x` and `b`. It can be
    used for both increasing and decreasing sigmoid-like data, but only for positive values of `x`.

    Parameters
    ----------
    x : any
        Input values.
    a : float | int
        Steepness of the curve.
    b : float | int
        Midpoint of the curve, i.e. the value of `x` at which the output is 0.5.

    Returns
    -------
    np.ndarray
        Output values.

    Notes
    -----
    Alternative name: `VTR22_eq5`.
    The sigmoid like function is defined as:

    .. math::

        f(x) = \frac{1}{1 + e^{a(\sqrt{x} - \sqrt{b})}}

    References
    ----------
    :cite:cts:`vetharaniam_lsa_2022`
    """
    root_gap = np.sqrt(x) - np.sqrt(b)  # distance from the midpoint on the square-root scale
    denominator = 1 + np.exp(a * root_gap)
    return 1 / denominator




[docs]
@declare_equation("gaussian", "VTR24_eq8")
def vetharaniam2024_eq8(x, a, b, c) -> np.ndarray:
    r"""
    Gaussian like function.

    Intended for Gaussian-like data, either positive or negative. Depending on `c`, the curve can
    show a plateau around the midpoint.

    Parameters
    ----------
    x : any
        Input values to map.
    a : float | int
        Steepness of the curve. Should be a positive number.
    b : float | int
        Midpoint of the curve, i.e. the value of `x` at which the output is 1.
    c : float | int
        Scaling parameter used as the exponent of `(x - b)`. Should be an even number. If negative,
        the function will be flipped.

    Returns
    -------
    np.ndarray
        Suitability values.

    Notes
    -----
    Alternative name: `VTR24_eq8`.
    The Gaussian like function is defined as:

    .. math::

        f(x) = e^{-a(x - b)^c}

    References
    ----------
    :cite:cts:`vetharaniam_lsa_2024`
    """
    centred = np.subtract(x, b)  # inputs relative to the midpoint
    exponent = -a * np.power(centred, c)
    return np.exp(exponent)




[docs]
@declare_equation("gaussian", "VTR24_eq10")
def vetharaniam2024_eq10(x, a, b, c) -> np.ndarray:
    r"""
    Gaussian like function.

    Intended for Gaussian-like data; because `x` and `b` are raised to the power `c` before being
    compared, the curve can be asymmetric around its midpoint. It only works for positive values of `x`.

    Parameters
    ----------
    x : any
        Input values.
    a : float | int
        Steepness of the curve.
    b : float | int
        Midpoint of the curve, i.e. the value of `x` at which the output is 1.
    c : float | int
        Scaling parameter used as the exponent of `x` and `b`. Should be a positive number.

    Returns
    -------
    np.ndarray
        Output values.

    Notes
    -----
    Alternative name: `VTR24_eq10`.
    The Gaussian like function is evaluated as:

    .. math::

        f(x) = \frac{2}{1 + e^{a(x^c - b^c)^2}}

    References
    ----------
    :cite:cts:`vetharaniam_lsa_2024`
    """
    gap = np.power(x, c) - np.power(b, c)  # distance from the midpoint on the power-`c` scale
    growth = np.exp(a * np.power(gap, 2))
    return 2 / (1 + growth)




[docs]
def fit(x, y=None, kind: str | list[str] = "all", plot: bool = False, verbose: bool = False):
    """
    Fit standardization functions to data.

    This function fits membership functions to the provided data. It helps to determine the best membership function
    to use on the data.

    Parameters
    ----------
    x : any
        Input values to fit the functions on.
    y : any, optional
        Target values to fit the functions. Should be the same length as `x`. If not provided,
        the default values are used (0, 0.25, 0.5, 0.75, 1).
    kind : str | list[str], optional
        List of functions or function types to fit. If '{TYPES}_like', all equations corresponding to the
        type are fitted (available types: 'sigmoid', 'gaussian'). If 'all', all available equations are fitted.
    plot : bool, optional
        Whether to plot the fitted functions. Default is False. If matplotlib is not available, a warning
        is emitted and the fit is carried out without plotting.
    verbose : bool, optional
        Whether to print the fitting results. Default is False.

    Returns
    -------
    tuple
        A tuple containing the best fitting function and its parameters, or ``(None, None)`` when
        none of the requested functions could be fitted.

    Raises
    ------
    ValueError
        If `kind` is not a str or a list, or if it selects no function that can be fitted.
    """
    if y is None:
        y = [0, 0.25, 0.5, 0.75, 1]
    y = np.array(y)
    functions, skipped = _check_fitting(kind)

    # A missing matplotlib must only disable the figure, never the fit itself.
    can_plot = plot and plt is not None
    if plot and plt is None:
        warnings.warn("Matplotlib not found. Plotting skipped.", stacklevel=2)

    x_ = np.linspace(min(x), max(x), 100)
    # The three lists below are filled together so that names, errors and parameters
    # always stay aligned index by index.
    fitted = []
    rms_errors = []
    f_params = []
    for func in functions:
        try:
            f = get_function_from_name(func)
            p0 = _get_function_p0(func, x)
            popt, _ = curve_fit(f, x, y, p0=p0, maxfev=15000)
            y_ = f(x_, *popt)
            rmse = _rmse(y, f(x, *popt))
        except Exception:
            # Best effort: a function that cannot be fitted is reported and left out.
            skipped.append(func)
            warnings.warn(f"Failed to fit `{func}`. Skipped.", stacklevel=2)
            continue

        fitted.append(func)
        f_params.append(popt)
        rms_errors.append(rmse)
        if can_plot:
            plt.plot(x_, y_, label=func + f" (RMSE={rmse:.2f})")

    if not fitted:
        warnings.warn(f"No methods to fit. Skipping: {', '.join(skipped)}.", stacklevel=2)
        return None, None

    if can_plot:
        plt.scatter(x, y, c="r")
        plt.legend()

    f_best, p_best = _get_best_fit(fitted, rms_errors, f_params, verbose=verbose)
    return get_function_from_name(f_best), p_best



def _check_fitting(kind="all") -> tuple[list[str], list[str]]:
    """
    Resolve `kind` into the names of the functions to fit.

    Parameters
    ----------
    kind : str | list[str], optional
        ``"all"``, a single name, or a list of names. A name is either a type selector
        (``"<type>_like"`` for each type in `FIT_TYPES`) or the name of an equation.

    Returns
    -------
    tuple[list[str], list[str]]
        The names of the functions to fit (without duplicates, in request order) and the
        names that were requested but skipped. A warning is emitted for every skipped name.

    Raises
    ------
    ValueError
        If `kind` is neither a str nor a list, or if no function is left to fit.
    """
    if not isinstance(kind, (str, list)):
        raise ValueError(f"`kind` should be a str or a list of string. Got {type(kind)}")

    # Selector name -> equation family, e.g. "sigmoid_like" -> "sigmoid".
    families = {t + "_like": t for t in FIT_TYPES}

    if isinstance(kind, str):
        # "all" is shorthand for every type selector; any other string is a single request.
        requested = list(families) if kind == "all" else [kind]
    else:
        requested = kind

    functions: list[str] = []
    _skipped = []
    for func in requested:
        if not isinstance(func, str):
            # Not added to the skipped names: callers join those as strings.
            warnings.warn(f"Ignoring non-string entry {func!r} in `kind`.", stacklevel=3)
            continue
        if func in families:
            for f in equations[families[func]].keys():
                if f not in functions:
                    functions.append(f)
        else:
            try:
                get_function_from_name(func)
            except Exception:
                _skipped.append(func)
                warnings.warn(f"`{func}` not found in implemented equations. Skipped.", stacklevel=3)
            else:
                if func not in functions:
                    functions.append(func)

    # Known equations that cannot be fitted, with the reason reported to the user.
    unsupported = {
        "sigmoid": "No parameters to determine for `sigmoid`. Skipped.",
        "vetharaniam2024_eq8": "Fitting does not support `vetharaniam2024_eq8`. Skipped.",
    }
    for f, reason in unsupported.items():
        if f in functions:
            functions.remove(f)
            _skipped.append(f)
            warnings.warn(reason, stacklevel=3)

    if not functions:
        raise ValueError("No functions to fit. Try to modify `kind` parameter.")
    return functions, _skipped


def _get_function_p0(func: str, x: np.ndarray) -> list[float]:
    """
    Build the initial parameter guess used to fit `func`.

    The guess is ``[1, median(x)]`` for functions registered under ``equations["sigmoid"]``,
    ``[1, median(x), 1]`` for those under ``equations["gaussian"]``, and an empty list otherwise.
    """
    # Each family shares the first two guesses; the tail holds any extra parameters.
    for family, tail in (("sigmoid", []), ("gaussian", [1])):
        if func in equations[family]:
            return [1, np.median(x), *tail]
    return []


def _rmse(y_true, y_pred) -> float:
    diff = abs(y_true - y_pred)
    return np.sqrt(np.mean(diff**2))


def _get_best_fit(functions, rmse, params, verbose=True) -> tuple[str, list[float]]:
    best_fit = np.nanargmin(rmse)
    if verbose:
        print(f"""
Best fit: {functions[best_fit]}
RMSE: {rmse[best_fit]:.5f}
Params: a={params[best_fit][0]}, b={params[best_fit][1]}
""")
    return functions[best_fit], params[best_fit]