Source code for lsapy.stats

"""Statistics Module."""

from typing import Any

import geopandas as gpd
import numpy as np
import pandas as pd
import regionmask
import xarray as xr

__all__ = ["stats_summary", "spatial_stats_summary"]



[docs]
def stats_summary(
    data: xr.Dataset,
    on_vars: list | None = None,
    on_dims: list | None = None,
    on_dim_values: dict[str, Any] | None = None,
    bins: list | np.ndarray | None = None,
    bins_labels: list | None = None,
    all_bins: bool | None = False,
    cell_area: tuple[float | int, str] | None = None,
    dropna: bool | None = False,
    **kwargs,
) -> pd.DataFrame:
    """
    Generate a descriptive statistics summary of the data.

    Returns a pandas DataFrame of data according to the given parameters.
    The statistics includes count, mean, std, min, max, and 25%, 50%, and 75% percentiles.
    Bins can be provided to further group the data into intervals.

    Parameters
    ----------
    data : xr.Dataset
        The input data.
    on_vars : list, optional
        Variables for which the statistics are calculated. If None (default), all variables are kept.
    on_dims : list, optional
        Dimensions for which the statistics are calculated. If None (default), all dimensions except
        spatial ones (i.e., `lon` or `x` and `lat` or `y`) are kept.
    on_dim_values : sequence, optional
        Values of dimensions to be kept in the summary. If None (default), all values are kept.
    bins : list or np.ndarray, optional
        Bins defining data intervals. If None (default), no binning is performed.
    bins_labels : list, optional
        Labels for the bins. If None (default), bins values are used as labels.
        The length of the list must be equal to the number of bins. Ignored if `bins` is None.
    all_bins : bool, optional
        If True, a additional bin corresponding to the bounds of `bins` is added. Default is False.
        Ignored if `bins` is None.
    cell_area : tuple of float or int and str, optional
        Add a column to the summary with the given associated area calculated based on the count statistic
        variable. The tuple must contain the area value and the unit of the area.
    dropna : bool, optional
        If True, dimensions with NaN values are removed. Default is False.
    **kwargs : dict, optional
        Additional keyword arguments passed to `pd.cut` used to bin the data.

    Returns
    -------
    pd.DataFrame
        A DataFrame with the statistics for the defined dimensions and variables, including:
        count, mean, std, min, max, and 25%, 50%, and 75% percentiles.

    Examples
    --------
    >>> from lsapy.utils import open_data
    >>> from lsapy import SuitabilityCriteria, LandSuitabilityAnalysis
    >>> import lsapy.standardize as lstd
    >>> from xclim.indicators.atmos import growing_degree_days

    Let's first define a Land Suitability Analysis (LSA):

    >>> drainage = open_data("land", variables="drainage")
    >>> tas = open_data("climate", variables="tas")
    >>> sc = {
    ...     "drainage_class": SuitabilityCriteria(
    ...         name="drainage_class",
    ...         long_name="Drainage Class Suitability",
    ...         weight=3,
    ...         category="soilTerrain",
    ...         indicator=drainage,
    ...         func=lstd.discrete,
    ...         fparams={"rules": {0: 0, 1: 0.1, 2: 0.5, 3: 0.9, 4: 1}},
    ...     ),
    ...     "growing_degree_days": SuitabilityCriteria(
    ...         name="growing_degree_days",
    ...         long_name="Growing Degree Days Suitability",
    ...         weight=1,
    ...         category="climate",
    ...         indicator=growing_degree_days(tas, thresh="10 degC", freq="YS-JUL"),
    ...         func=lstd.vetharaniam2022_eq5,
    ...         fparams={"a": -1.41, "b": 801},
    ...     ),
    ... }
    >>> lsa = LandSuitabilityAnalysis("land_use", sc)
    >>> lsa.run(inplace=True)

    We can then compute the statistics summary for the data:

    >>> stats = stats_summary(lsa.data)

    `on_vars`, `on_dims`, and `on_dim_values` parameters can be used to filter the data.
    If we want to get the statistics summary for only 'growing_degree_days', 'suitability', and
    the first three years, we can do:

    >>> stats = stats_summary(
    ...     lsa.data,
    ...     on_vars=["growing_degree_days", "suitability"],  # select variables
    ...     on_dim_values={"time": slice("2000", "2002")},  # select values of the time dimension
    ... )

    This will compute the statistics for the two variables and for each year of the 2000-2002 period.
    We can also provide bins to group the data into intervals. For example, if we want to get the statistics
    for four bins (0-0.25, 0.25-0.5, 0.5-0.75, 0.75-1), we can do:

    >>> stats = stats_summary(
    ...     lsa.data,
    ...     bins=[0, 0.25, 0.5, 0.75, 1],  # define bins
    ...     bins_labels=["unsuitable", "poorly suitable", "moderately suitable", "highly suitable"],  # define labels
    ...     all_bins=True,  # add an additional bin for the overall range (i.e., 0-1)
    ... )

    Finally, we can get the area associated with each bin by providing the area of each cell in the data.
    Assuming that each cell has an area of 5 hectares (ha), we can do:

    >>> stats = stats_summary(
    ...     lsa.data,
    ...     bins=[0, 0.25, 0.5, 0.75, 1],
    ...     bins_labels=["unsuitable", "poorly suitable", "moderately suitable", "highly suitable"],
    ...     all_bins=True,
    ...     cell_area=(5, "ha"),  # define the area of each cell
    ... )
    """

    def _close_lowest_bin(x: pd.Series, bins) -> pd.Series:
        first_cat = x.cat.categories[0]
        lf = first_cat.left + (abs(first_cat.left) - bins[0])
        return x.cat.rename_categories({first_cat: pd.Interval(lf, first_cat.right, closed="both")})

    if bins is not None and bins_labels is not None and (len(bins) - 1 != len(bins_labels)):
        raise ValueError("bins and bins_labels must have the same length")

    if on_dim_values is not None:
        for dim, value in on_dim_values.items():
            data = data.sel({dim: value})

    if on_vars is None:
        on_vars = list(data.data_vars)
    data = data[on_vars]
    if on_dims is None:
        on_dims = [d for d in data.dims if d not in ["lat", "lon", "x", "y"]]  # remove spatial dims
    if cell_area:
        cell_value, cell_unit = cell_area

    df = data.to_dataframe().reset_index()
    if len(on_dims) > 0:
        df = df.drop(columns=[c for c in data.coords if c not in on_dims])

    df = df.melt(id_vars=on_dims)
    _dims = ["variable"] + on_dims

    if bins is not None:
        df["bin"] = pd.cut(df["value"], bins=pd.Index(bins), **kwargs)
        if "include_lowest" in kwargs and kwargs["include_lowest"]:
            df["bin"] = _close_lowest_bin(df["bin"], bins)

        if bins_labels is not None:
            lab_mapping = dict(zip(df["bin"].cat.categories.astype(str), bins_labels, strict=False))
        _dims.append("bin")
        if all_bins:
            all_bins_inter = pd.Interval(
                df["bin"].cat.categories[0].left, df["bin"].cat.categories[-1].right, closed="both"
            )
            df_ = df.drop(columns=["bin"]).assign(bin=all_bins_inter)
            df_.loc[df["value"].isnull(), "bin"] = np.nan
            if bins_labels is not None:
                lab_mapping.update({str(all_bins_inter): "bins_range"})
            df = pd.concat([df, df_])
        df["bin"] = df["bin"].astype(str)

    df_out = df.groupby(_dims, observed=False).describe().droplevel(0, axis=1).reset_index()

    if bins_labels is not None:
        bin_idx = np.where(df_out.columns == "bin")[0][0]
        df_out.insert(bin_idx + 1, "bin_label", df_out["bin"].map(lab_mapping).values)

    if cell_area:
        df_out[f"area_{cell_unit}"] = df_out["count"] * cell_value

    if dropna:
        return df_out.dropna()
    return df_out




[docs]
def spatial_stats_summary(
    data: xr.DataArray | xr.Dataset,
    areas: gpd.GeoDataFrame,
    name: str = "area",
    mask_kwargs: dict[str, Any] | None = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Generate a descriptive statistics summary of the data for given areas.

    Returns a pandas DataFrame of data according to the given parameters.
    The statistics includes count, mean, std, min, max, and 25%, 50%, and 75% percentiles.
    Bins can be provided to further group the data into intervals.
    The statistics are calculated for each area provided in the `areas` GeoDataFrame.

    Parameters
    ----------
    data : xr.DataArray | xr.Dataset
        The input data.
    areas : gpd.GeoDataFrame
        Areas to be used as spatial masks.
    name : str, optional
        Name of the area column in the output DataFrame. Default is 'area'.
    mask_kwargs : dict, optional
        Additional keyword arguments passed to `regionmask.from_geopandas`.
    **kwargs : dict, optional
        Additional keyword arguments passed to `lsapy.stats.stats_summary`.

    Returns
    -------
    pd.DataFrame
        A DataFrame with the statistics for each area, including:
        count, mean, std, min, max, and 25%, 50%, and 75% percentiles.

    Examples
    --------
    >>> from lsapy.utils import open_data
    >>> from lsapy import SuitabilityCriteria, LandSuitabilityAnalysis
    >>> import lsapy.standardize as lstd
    >>> from xclim.indicators.atmos import growing_degree_days
    >>> import geopandas as gpd

    Let's first define a Land Suitability Analysis (LSA):

    >>> drainage = open_data("land", variables="drainage")
    >>> tas = open_data("climate", variables="tas")
    >>> sc = {
    ...     "drainage_class": SuitabilityCriteria(
    ...         name="drainage_class",
    ...         long_name="Drainage Class Suitability",
    ...         weight=3,
    ...         category="soilTerrain",
    ...         indicator=drainage,
    ...         func=lstd.discrete,
    ...         fparams={"rules": {0: 0, 1: 0.1, 2: 0.5, 3: 0.9, 4: 1}},
    ...     ),
    ...     "growing_degree_days": SuitabilityCriteria(
    ...         name="growing_degree_days",
    ...         long_name="Growing Degree Days Suitability",
    ...         weight=1,
    ...         category="climate",
    ...         indicator=growing_degree_days(tas, thresh="10 degC", freq="YS-JUL"),
    ...         func=lstd.vetharaniam2022_eq5,
    ...         fparams={"a": -1.41, "b": 801},
    ...     ),
    ... }
    >>> lsa = LandSuitabilityAnalysis("land_use", sc)
    >>> lsa.run(inplace=True)

    We can then load a GeoDataFrame of areas and compute the statistics summary for each area.

    >>> areas = gpd.read_file("path_to_your_areas_file.shp")  # doctest: +SKIP
    >>> stats = spatial_stats_summary(lsa.data, areas)  # doctest: +SKIP
    """
    if mask_kwargs is None:
        mask_kwargs = {}

    regions = regionmask.from_geopandas(areas, name=name, **mask_kwargs)
    mask = regions.mask_3D(data)

    out = []
    for r in mask["region"].values:
        df = stats_summary(data.where(mask.sel(region=r)), **kwargs)
        df.insert(0, name, regions[r].name)
        out.append(df)
    return pd.concat(out)