mgplot.summary_plot

summary_plot.py: Produce a summary plot for the data in a given DataFrame. The data is normalised to z-scores and scaled.

  1"""
  2summary_plot.py:
  3Produce a summary plot for the data in a given DataFrame.
  4The data is normalised to z-scores and scaled.
  5"""
  6
  7# --- imports
  8# system imports
  9from typing import Any
 10
 11# from collections.abc import Sequence
 12
 13# analytic third-party imports
 14from numpy import ndarray, array
 15from matplotlib.pyplot import Axes, subplots
 16from pandas import DataFrame, Period
 17
 18# local imports
 19from mgplot.settings import DataT
 20from mgplot.finalise_plot import make_legend
 21from mgplot.utilities import constrain_data, check_clean_timeseries
 22from mgplot.kw_type_checking import (
 23    report_kwargs,
 24    ExpectedTypeDict,
 25    validate_expected,
 26    validate_kwargs,
 27)
 28from mgplot.keyword_names import (
 29    AX,
 30    VERBOSE,
 31    MIDDLE,
 32    PLOT_TYPE,
 33    PLOT_FROM
 34)
 35
 36
 37# --- constants
 38ZSCORES = "zscores"
 39ZSCALED = "zscaled"
 40
 41SUMMARY_KW_TYPES: ExpectedTypeDict = {
 42    AX: (Axes, type(None)),
 43    VERBOSE: bool,
 44    MIDDLE: float,
 45    PLOT_TYPE: str,
 46    PLOT_FROM: (int, Period, type(None)),
 47}
 48validate_expected(SUMMARY_KW_TYPES, "summary_plot")
 49
 50
 51# --- functions
 52def _calc_quantiles(middle: float) -> ndarray:
 53    """Calculate the quantiles for the middle of the data."""
 54    return array([(1 - middle) / 2.0, 1 - (1 - middle) / 2.0])
 55
 56
 57def _calculate_z(
 58    original: DataFrame,  # only contains the data points of interest
 59    middle: float,  # middle proportion of data to highlight (eg. 0.8)
 60    verbose: bool = False,  # print the summary data
 61) -> tuple[DataFrame, DataFrame]:
 62    """Calculate z-scores, scaled z-scores and middle quantiles.
 63    Return z_scores, z_scaled, q (which are the quantiles for the
 64    start/end of the middle proportion of data to highlight)."""
 65
 66    # calculate z-scores, scaled scores and middle quantiles
 67    z_scores: DataFrame = (original - original.mean()) / original.std()
 68    z_scaled: DataFrame = (
 69        # scale z-scores between -1 and +1
 70        (((z_scores - z_scores.min()) / (z_scores.max() - z_scores.min())) - 0.5)
 71        * 2
 72    )
 73    q_middle = _calc_quantiles(middle)
 74
 75    if verbose:
 76        frame = DataFrame(
 77            {
 78                "count": original.count(),
 79                "mean": original.mean(),
 80                "median": original.median(),
 81                "min shaded": original.quantile(q=q_middle[0]),
 82                "max shaded": original.quantile(q=q_middle[1]),
 83                "z-scores": z_scores.iloc[-1],
 84                "scaled": z_scaled.iloc[-1],
 85            }
 86        )
 87        print(frame)
 88
 89    return DataFrame(z_scores), DataFrame(z_scaled)  # syntactic sugar for type hinting
 90
 91
 92def _plot_middle_bars(
 93    adjusted: DataFrame,
 94    middle: float,
 95    kwargs: dict[str, Any],  # must be a dictionary, not a splat
 96) -> Axes:
 97    """Plot the middle (typically 80%) of the data as a bar.
 98    Note: also sets the x-axis limits in kwargs.
 99    Return the matplotlib Axes object."""
100
101    q = _calc_quantiles(middle)
102    lo_hi: DataFrame = adjusted.quantile(q=q).T  # get the middle section of data
103    span = 1.15
104    space = 0.2
105    low = min(adjusted.iloc[-1].min(), lo_hi.min().min(), -span) - space
106    high = max(adjusted.iloc[-1].max(), lo_hi.max().max(), span) + space
107    kwargs["xlim"] = (low, high)  # remember the x-axis limits
108    _fig, ax = subplots()
109    ax.barh(
110        y=lo_hi.index,
111        width=lo_hi[q[1]] - lo_hi[q[0]],
112        left=lo_hi[q[0]],
113        color="#bbbbbb",
114        label=f"Middle {middle*100:0.0f}% of prints",
115    )
116    return ax
117
118
def _plot_latest_datapoint(
    ax: Axes,
    original: DataFrame,
    adjusted: DataFrame,
    f_size: int,
) -> None:
    """Add the latest datapoints to the summary plot.

    Args:
    - ax: the Axes to draw on.
    - original: the unadjusted data; its final row supplies the label text.
    - adjusted: the adjusted data; its final row supplies the x positions.
    - f_size: font size for the value labels."""

    ax.scatter(adjusted.iloc[-1], adjusted.columns, color="darkorange", label="Latest")
    row = adjusted.index[-1]
    for col_num, col_name in enumerate(original.columns):
        # label each point with its original value (not the adjusted one)
        ax.text(
            x=adjusted.at[row, col_name],
            y=col_num,
            s=f"{original.at[row, col_name]:.1f}",
            ha="center",
            va="center",
            size=f_size,  # use the caller's font size
        )
139
140
def _label_extremes(
    ax: Axes,
    data: tuple[DataFrame, DataFrame],
    plot_type: str,
    f_size: int,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> None:
    """Annotate a scaled plot with its bounds, medians and extreme values.

    Nothing is drawn unless plot_type is ZSCALED. The x-axis limits are
    read from kwargs["xlim"]."""

    original, adjusted = data
    low, high = kwargs["xlim"]
    if plot_type != ZSCALED:
        return

    # dashed guide lines at the scaled minimum and maximum
    for bound in (-1, 1):
        ax.axvline(bound, color="#555555", linewidth=0.5, linestyle="--")

    ax.scatter(
        adjusted.median(),
        adjusted.columns,
        color="darkorchid",
        marker="x",
        s=5,
        label="Median",
    )

    # print each column's original minimum and maximum at the plot edges
    for position, column in enumerate(original.columns):
        series = original[column]
        edge_labels = (
            (low, f" {series.min():.1f}", "left"),
            (high, f"{series.max():.1f} ", "right"),
        )
        for x_pos, text, align in edge_labels:
            ax.text(x_pos, position, text, ha=align, va="center", size=f_size)
180
181
def _horizontal_bar_plot(
    original: DataFrame,
    adjusted: DataFrame,
    middle: float,
    plot_type: str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> Axes:
    """Build the summary chart: middle bars, latest points and extreme labels.

    kwargs is passed as a dictionary (not a splat) so that the x-axis
    limits stored in it by the bar plotting step can be read afterwards.
    Return the matplotlib Axes object."""

    label_size = 10  # font size shared by all the text annotations

    ax = _plot_middle_bars(adjusted, middle, kwargs)
    _plot_latest_datapoint(ax, original, adjusted, label_size)
    _label_extremes(ax, (original, adjusted), plot_type, label_size, kwargs)
    return ax
203
204
205# public
def summary_plot(
    data: DataT,  # summary data
    **kwargs,
) -> Axes:
    """Plot a summary of historical data for a given DataFrame.

    Args:
    - data: DataFrame containing the summary data. The column names are
      used as labels for the plot.
    - kwargs: additional arguments for the plot, including:
        - plot_from: int | Period | None
        - verbose: if True, print the summary data.
        - middle: proportion of data to highlight (default is 0.8).
        - plot_type: "zscores" (the default) or "zscaled".

    Returns Axes.

    Raises:
    - TypeError: if the checked data is not a pandas DataFrame.
    - ValueError: if plot_type is not a recognised plot type.
    """

    # --- check the kwargs
    me = "summary_plot"
    report_kwargs(called_from=me, **kwargs)
    kwargs = validate_kwargs(SUMMARY_KW_TYPES, me, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, me)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")
    df = DataFrame(data)  # syntactic sugar for type hinting

    # --- optional arguments
    verbose = kwargs.pop(VERBOSE, False)
    middle = float(kwargs.pop(MIDDLE, 0.8))
    plot_type = kwargs.pop(PLOT_TYPE, ZSCORES)
    if plot_type not in (ZSCORES, ZSCALED):
        # fail loudly rather than silently drawing the scaled plot
        raise ValueError(
            f"plot_type must be '{ZSCORES}' or '{ZSCALED}', not {plot_type!r}"
        )
    kwargs.setdefault(
        "legend",
        {
            # put the legend below the x-axis label
            "loc": "upper center",
            "fontsize": "xx-small",
            "bbox_to_anchor": (0.5, -0.125),
            "ncol": 4,
        },
    )

    # get the data, calculate z-scores and scaled scores based on the start period
    subset, kwargs = constrain_data(df, **kwargs)
    z_scores, z_scaled = _calculate_z(subset, middle, verbose=verbose)

    # plot as required by the plot_type argument
    adjusted = z_scores if plot_type == ZSCORES else z_scaled
    ax = _horizontal_bar_plot(subset, adjusted, middle, plot_type, kwargs)
    ax.tick_params(axis="y", labelsize="small")
    make_legend(ax, kwargs["legend"])
    ax.set_xlim(kwargs.get("xlim", None))  # provide space for the labels

    return ax
ZSCORES = 'zscores'
ZSCALED = 'zscaled'
SUMMARY_KW_TYPES: mgplot.kw_type_checking.ExpectedTypeDict = {'ax': (<class 'matplotlib.axes._axes.Axes'>, <class 'NoneType'>), 'verbose': <class 'bool'>, 'middle': <class 'float'>, 'plot_type': <class 'str'>, 'plot_from': (<class 'int'>, <class 'pandas._libs.tslibs.period.Period'>, <class 'NoneType'>)}
def summary_plot(data: ~DataT, **kwargs) -> matplotlib.axes._axes.Axes:
def summary_plot(
    data: DataT,  # summary data
    **kwargs,
) -> Axes:
    """Plot a summary of historical data for a given DataFrame.

    Args:
    - data: DataFrame containing the summary data. The column names are
      used as labels for the plot.
    - kwargs: additional arguments for the plot, including:
        - plot_from: int | Period | None
        - verbose: if True, print the summary data.
        - middle: proportion of data to highlight (default is 0.8).
        - plot_type: "zscores" (the default) or "zscaled".

    Returns Axes.

    Raises:
    - TypeError: if the checked data is not a pandas DataFrame.
    - ValueError: if plot_type is not a recognised plot type.
    """

    # --- check the kwargs
    me = "summary_plot"
    report_kwargs(called_from=me, **kwargs)
    kwargs = validate_kwargs(SUMMARY_KW_TYPES, me, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, me)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")
    df = DataFrame(data)  # syntactic sugar for type hinting

    # --- optional arguments
    verbose = kwargs.pop(VERBOSE, False)
    middle = float(kwargs.pop(MIDDLE, 0.8))
    plot_type = kwargs.pop(PLOT_TYPE, ZSCORES)
    if plot_type not in (ZSCORES, ZSCALED):
        # fail loudly rather than silently drawing the scaled plot
        raise ValueError(
            f"plot_type must be '{ZSCORES}' or '{ZSCALED}', not {plot_type!r}"
        )
    kwargs.setdefault(
        "legend",
        {
            # put the legend below the x-axis label
            "loc": "upper center",
            "fontsize": "xx-small",
            "bbox_to_anchor": (0.5, -0.125),
            "ncol": 4,
        },
    )

    # get the data, calculate z-scores and scaled scores based on the start period
    subset, kwargs = constrain_data(df, **kwargs)
    z_scores, z_scaled = _calculate_z(subset, middle, verbose=verbose)

    # plot as required by the plot_type argument
    adjusted = z_scores if plot_type == ZSCORES else z_scaled
    ax = _horizontal_bar_plot(subset, adjusted, middle, plot_type, kwargs)
    ax.tick_params(axis="y", labelsize="small")
    make_legend(ax, kwargs["legend"])
    ax.set_xlim(kwargs.get("xlim", None))  # provide space for the labels

    return ax

Plot a summary of historical data for a given DataFrame.

Args:

  • data: DataFrame containing the summary data. The column names are used as labels for the plot.
  • kwargs: additional arguments for the plot, including:
    • plot_from: int | Period | None
    • verbose: if True, print the summary data.
    • middle: proportion of data to highlight (default is 0.8).
    • plot_type: the plot to generate, either "zscores" (the default) or "zscaled".

Returns Axes.