Source code for lsapy.stats

"""Statistics Module."""

from typing import Any

import geopandas as gpd
import numpy as np
import pandas as pd
import regionmask
import xarray as xr

__all__ = ["stats_summary", "spatial_stats_summary"]


[docs] def stats_summary( data: xr.Dataset, on_vars: list | None = None, on_dims: list | None = None, on_dim_values: dict[str, Any] | None = None, bins: list | np.ndarray | None = None, bins_labels: list | None = None, all_bins: bool | None = False, cell_area: tuple[float | int, str] | None = None, dropna: bool | None = False, **kwargs, ) -> pd.DataFrame: """ Generate a descriptive statistics summary of the data. Returns a pandas DataFrame of data according to the given parameters. The statistics includes count, mean, std, min, max, and 25%, 50%, and 75% percentiles. Bins can be provided to further group the data into intervals. Parameters ---------- data : xr.Dataset The input data. on_vars : list, optional Variables for which the statistics are calculated. If None (default), all variables are kept. on_dims : list, optional Dimensions for which the statistics are calculated. If None (default), all dimensions except spatial ones (i.e., `lon` or `x` and `lat` or `y`) are kept. on_dim_values : sequence, optional Values of dimensions to be kept in the summary. If None (default), all values are kept. bins : list or np.ndarray, optional Bins defining data intervals. If None (default), no binning is performed. bins_labels : list, optional Labels for the bins. If None (default), bins values are used as labels. The length of the list must be equal to the number of bins. Ignored if `bins` is None. all_bins : bool, optional If True, a additional bin corresponding to the bounds of `bins` is added. Default is False. Ignored if `bins` is None. cell_area : tuple of float or int and str, optional Add a column to the summary with the given associated area calculated based on the count statistic variable. The tuple must contain the area value and the unit of the area. dropna : bool, optional If True, dimensions with NaN values are removed. Default is False. **kwargs : dict, optional Additional keyword arguments passed to `pd.cut` used to bin the data. Returns ------- pd.DataFrame A DataFrame with the statistics for the defined dimensions and variables, including: count, mean, std, min, max, and 25%, 50%, and 75% percentiles. Examples -------- >>> from lsapy.utils import open_data >>> from lsapy import SuitabilityCriteria, LandSuitabilityAnalysis >>> import lsapy.standardize as lstd >>> from xclim.indicators.atmos import growing_degree_days Let's first define a Land Suitability Analysis (LSA): >>> drainage = open_data("land", variables="drainage") >>> tas = open_data("climate", variables="tas") >>> sc = { ... "drainage_class": SuitabilityCriteria( ... name="drainage_class", ... long_name="Drainage Class Suitability", ... weight=3, ... category="soilTerrain", ... indicator=drainage, ... func=lstd.discrete, ... fparams={"rules": {0: 0, 1: 0.1, 2: 0.5, 3: 0.9, 4: 1}}, ... ), ... "growing_degree_days": SuitabilityCriteria( ... name="growing_degree_days", ... long_name="Growing Degree Days Suitability", ... weight=1, ... category="climate", ... indicator=growing_degree_days(tas, thresh="10 degC", freq="YS-JUL"), ... func=lstd.vetharaniam2022_eq5, ... fparams={"a": -1.41, "b": 801}, ... ), ... } >>> lsa = LandSuitabilityAnalysis("land_use", sc) >>> lsa.run(inplace=True) We can then compute the statistics summary for the data: >>> stats = stats_summary(lsa.data) `on_vars`, `on_dims`, and `on_dim_values` parameters can be used to filter the data. If we want to get the statistics summary for only 'growing_degree_days', 'suitability', and the first three years, we can do: >>> stats = stats_summary( ... lsa.data, ... on_vars=["growing_degree_days", "suitability"], # select variables ... on_dim_values={"time": slice("2000", "2002")}, # select values of the time dimension ... ) This will compute the statistics for the two variables and for each year of the 2000-2002 period. We can also provide bins to group the data into intervals. For example, if we want to get the statistics for four bins (0-0.25, 0.25-0.5, 0.5-0.75, 0.75-1), we can do: >>> stats = stats_summary( ... lsa.data, ... bins=[0, 0.25, 0.5, 0.75, 1], # define bins ... bins_labels=["unsuitable", "poorly suitable", "moderately suitable", "highly suitable"], # define labels ... all_bins=True, # add an additional bin for the overall range (i.e., 0-1) ... ) Finally, we can get the area associated with each bin by providing the area of each cell in the data. Assuming that each cell has an area of 5 hectares (ha), we can do: >>> stats = stats_summary( ... lsa.data, ... bins=[0, 0.25, 0.5, 0.75, 1], ... bins_labels=["unsuitable", "poorly suitable", "moderately suitable", "highly suitable"], ... all_bins=True, ... cell_area=(5, "ha"), # define the area of each cell ... ) """ def _close_lowest_bin(x: pd.Series, bins) -> pd.Series: first_cat = x.cat.categories[0] lf = first_cat.left + (abs(first_cat.left) - bins[0]) return x.cat.rename_categories({first_cat: pd.Interval(lf, first_cat.right, closed="both")}) if bins is not None and bins_labels is not None and (len(bins) - 1 != len(bins_labels)): raise ValueError("bins and bins_labels must have the same length") if on_dim_values is not None: for dim, value in on_dim_values.items(): data = data.sel({dim: value}) if on_vars is None: on_vars = list(data.data_vars) data = data[on_vars] if on_dims is None: on_dims = [d for d in data.dims if d not in ["lat", "lon", "x", "y"]] # remove spatial dims if cell_area: cell_value, cell_unit = cell_area df = data.to_dataframe().reset_index() if len(on_dims) > 0: df = df.drop(columns=[c for c in data.coords if c not in on_dims]) df = df.melt(id_vars=on_dims) _dims = ["variable"] + on_dims if bins is not None: df["bin"] = pd.cut(df["value"], bins=pd.Index(bins), **kwargs) if "include_lowest" in kwargs and kwargs["include_lowest"]: df["bin"] = _close_lowest_bin(df["bin"], bins) if bins_labels is not None: lab_mapping = dict(zip(df["bin"].cat.categories.astype(str), bins_labels, strict=False)) _dims.append("bin") if all_bins: all_bins_inter = pd.Interval( df["bin"].cat.categories[0].left, df["bin"].cat.categories[-1].right, closed="both" ) df_ = df.drop(columns=["bin"]).assign(bin=all_bins_inter) df_.loc[df["value"].isnull(), "bin"] = np.nan if bins_labels is not None: lab_mapping.update({str(all_bins_inter): "bins_range"}) df = pd.concat([df, df_]) df["bin"] = df["bin"].astype(str) df_out = df.groupby(_dims, observed=False).describe().droplevel(0, axis=1).reset_index() if bins_labels is not None: bin_idx = np.where(df_out.columns == "bin")[0][0] df_out.insert(bin_idx + 1, "bin_label", df_out["bin"].map(lab_mapping).values) if cell_area: df_out[f"area_{cell_unit}"] = df_out["count"] * cell_value if dropna: return df_out.dropna() return df_out
[docs] def spatial_stats_summary( data: xr.DataArray | xr.Dataset, areas: gpd.GeoDataFrame, name: str = "area", mask_kwargs: dict[str, Any] | None = None, **kwargs, ) -> pd.DataFrame: """ Generate a descriptive statistics summary of the data for given areas. Returns a pandas DataFrame of data according to the given parameters. The statistics includes count, mean, std, min, max, and 25%, 50%, and 75% percentiles. Bins can be provided to further group the data into intervals. The statistics are calculated for each area provided in the `areas` GeoDataFrame. Parameters ---------- data : xr.DataArray | xr.Dataset The input data. areas : gpd.GeoDataFrame Areas to be used as spatial masks. name : str, optional Name of the area column in the output DataFrame. Default is 'area'. mask_kwargs : dict, optional Additional keyword arguments passed to `regionmask.from_geopandas`. **kwargs : dict, optional Additional keyword arguments passed to `lsapy.stats.stats_summary`. Returns ------- pd.DataFrame A DataFrame with the statistics for each area, including: count, mean, std, min, max, and 25%, 50%, and 75% percentiles. Examples -------- >>> from lsapy.utils import open_data >>> from lsapy import SuitabilityCriteria, LandSuitabilityAnalysis >>> import lsapy.standardize as lstd >>> from xclim.indicators.atmos import growing_degree_days >>> import geopandas as gpd Let's first define a Land Suitability Analysis (LSA): >>> drainage = open_data("land", variables="drainage") >>> tas = open_data("climate", variables="tas") >>> sc = { ... "drainage_class": SuitabilityCriteria( ... name="drainage_class", ... long_name="Drainage Class Suitability", ... weight=3, ... category="soilTerrain", ... indicator=drainage, ... func=lstd.discrete, ... fparams={"rules": {0: 0, 1: 0.1, 2: 0.5, 3: 0.9, 4: 1}}, ... ), ... "growing_degree_days": SuitabilityCriteria( ... name="growing_degree_days", ... long_name="Growing Degree Days Suitability", ... weight=1, ... category="climate", ... indicator=growing_degree_days(tas, thresh="10 degC", freq="YS-JUL"), ... func=lstd.vetharaniam2022_eq5, ... fparams={"a": -1.41, "b": 801}, ... ), ... } >>> lsa = LandSuitabilityAnalysis("land_use", sc) >>> lsa.run(inplace=True) We can then load a GeoDataFrame of areas and compute the statistics summary for each area. >>> areas = gpd.read_file("path_to_your_areas_file.shp") # doctest: +SKIP >>> stats = spatial_stats_summary(lsa.data, areas) # doctest: +SKIP """ if mask_kwargs is None: mask_kwargs = {} regions = regionmask.from_geopandas(areas, name=name, **mask_kwargs) mask = regions.mask_3D(data) out = [] for r in mask["region"].values: df = stats_summary(data.where(mask.sel(region=r)), **kwargs) df.insert(0, name, regions[r].name) out.append(df) return pd.concat(out)