mgplot.summary_plot
Produce a summary plot for the data in a given DataFrame.
"""Produce a summary plot for the data in a given DataFrame."""

# system imports
from typing import Any, NotRequired, Unpack

from matplotlib.axes import Axes

# analytic third-party imports
from numpy import array, ndarray
from pandas import DataFrame, Period

from mgplot.finalise_plot import make_legend
from mgplot.keyword_checking import (
    BaseKwargs,
    report_kwargs,
    validate_kwargs,
)

# local imports
from mgplot.settings import DataT
from mgplot.utilities import check_clean_timeseries, constrain_data, get_axes, label_period

# --- constants
ME = "summary_plot"
ZSCORES = "zscores"
ZSCALED = "zscaled"

# Plot layout constants
SPAN_LIMIT = 1.15
SPACE_MARGIN = 0.2
DEFAULT_FONT_SIZE = 10
SMALL_FONT_SIZE = "x-small"
SMALL_MARKER_SIZE = 5
REFERENCE_LINE_WIDTH = 0.5
DEFAULT_MIDDLE = 0.8
DEFAULT_PLOT_FROM = 0
HIGH_PRECISION_THRESHOLD = 1


class SummaryKwargs(BaseKwargs):
    """Keyword arguments for the summary_plot function."""

    ax: NotRequired[Axes | None]
    verbose: NotRequired[bool]
    middle: NotRequired[float]
    plot_type: NotRequired[str]
    plot_from: NotRequired[int | Period]
    legend: NotRequired[bool | dict[str, Any] | None]
    xlabel: NotRequired[str | None]


# --- functions
def calc_quantiles(middle: float) -> ndarray:
    """Calculate the quantiles for the middle of the data.

    Args:
        middle: proportion of the data inside the central band (e.g. 0.8).

    Returns:
        ndarray: two-element array holding the lower and upper quantile.

    """
    return array([(1 - middle) / 2.0, 1 - (1 - middle) / 2.0])


def calculate_z(
    original: DataFrame,
    middle: float,
    *,
    verbose: bool = False,
) -> tuple[DataFrame, DataFrame]:
    """Calculate z-scores, scaled z-scores and middle quantiles.

    Args:
        original: DataFrame containing the original data.
        middle: float, the proportion of data to highlight in the middle (e.g. 0.8 for 80%).
        verbose: bool, whether to print the summary data.

    Returns:
        tuple[DataFrame, DataFrame]: z_scores and z_scaled DataFrames. The
            scaled frame maps each column's smallest z-score to -1 and its
            largest to +1.

    Raises:
        ValueError: If original DataFrame is empty or has zero variance.

    """
    if original.empty:
        raise ValueError("Cannot calculate z-scores for empty DataFrame")

    # Check for zero variance
    std_dev = original.std()
    if (std_dev == 0).any():
        raise ValueError("Cannot calculate z-scores when standard deviation is zero")

    # Calculate z-scores
    mean = original.mean()
    z_scores: DataFrame = (original - mean) / std_dev

    # Scale z-scores between -1 and +1
    z_min = z_scores.min()
    z_range = z_scores.max() - z_min

    # A column can still have a zero z-range when rounding leaves a constant
    # column with a tiny non-zero standard deviation. Handle that per column
    # so one degenerate column does not flatten every other column.
    degenerate = z_range == 0
    z_scaled: DataFrame = (((z_scores - z_min) / z_range.mask(degenerate)) - 0.5) * 2
    for column in degenerate[degenerate].index:
        z_scaled[column] = z_scores[column] * 0  # all zeros if no spread

    if verbose:
        q_low, q_high = calc_quantiles(middle)
        frame = DataFrame(
            {
                "count": original.count(),
                "mean": mean,
                "median": original.median(),
                "min shaded": original.quantile(q=q_low),
                "max shaded": original.quantile(q=q_high),
                "z-scores": z_scores.iloc[-1],
                "scaled": z_scaled.iloc[-1],
            },
        )
        print(frame)

    return z_scores, z_scaled


def plot_middle_bars(
    adjusted: DataFrame,
    middle: float,
    kwargs: dict[str, Any],
) -> Axes:
    """Plot the middle (typically 80%) of the data as a bar.

    Args:
        adjusted: z-scores (or scaled z-scores), one column per series.
        middle: proportion of the data covered by each bar.
        kwargs: plotting options passed to get_axes(); updated in place
            with the computed "xlim".

    Returns:
        Axes: the axes holding the horizontal bars.

    Raises:
        ValueError: If adjusted is empty.

    """
    if adjusted.empty:
        raise ValueError("Cannot plot bars for empty DataFrame")

    q = calc_quantiles(middle)
    lo_hi: DataFrame = adjusted.quantile(q=q).T  # get the middle section of data

    low = min(adjusted.iloc[-1].min(), lo_hi.min().min(), -SPAN_LIMIT) - SPACE_MARGIN
    high = max(adjusted.iloc[-1].max(), lo_hi.max().max(), SPAN_LIMIT) + SPACE_MARGIN
    kwargs["xlim"] = (low, high)  # update the kwargs with the xlim
    ax, _ = get_axes(**kwargs)
    ax.barh(
        y=lo_hi.index,
        width=lo_hi[q[1]] - lo_hi[q[0]],
        left=lo_hi[q[0]],
        color="#bbbbbb",
        label=f"Middle {middle * 100:0.0f}% of prints",
    )
    return ax


def plot_latest_datapoint(
    ax: Axes,
    original: DataFrame,
    adjusted: DataFrame,
    font_size: int | str,
) -> None:
    """Add the latest datapoints to the summary plot.

    Each point is drawn at its adjusted position and labelled with the
    corresponding value from the original data.

    Raises:
        ValueError: If either DataFrame is empty.

    """
    if adjusted.empty or original.empty:
        raise ValueError("Cannot plot datapoints for empty DataFrame")

    ax.scatter(adjusted.iloc[-1], adjusted.columns, color="darkorange", label="Latest")
    row = adjusted.index[-1]
    for col_num, col_name in enumerate(original.columns):
        x_adj = float(adjusted.at[row, col_name])
        x_orig = float(original.at[row, col_name])
        # small magnitudes get an extra decimal place
        precision = 2 if abs(x_orig) < HIGH_PRECISION_THRESHOLD else 1
        ax.text(
            x=x_adj,
            y=col_num,
            s=f"{x_orig:.{precision}f}",
            ha="center",
            va="center",
            size=font_size,
        )


def label_extremes(
    ax: Axes,
    data: tuple[DataFrame, DataFrame],
    plot_type: str,
    font_size: int | str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> None:
    """Label the extremes in the scaled plots.

    Always applies kwargs["xlim"] to the axes. For the scaled plot type it
    also marks each column's median and writes the original minimum and
    maximum at the left and right edges.
    """
    original, adjusted = data
    low, high = kwargs["xlim"]
    ax.set_xlim(low, high)  # set the x-axis limits
    if plot_type == ZSCALED:
        ax.scatter(
            adjusted.median(),
            adjusted.columns,
            color="darkorchid",
            marker="x",
            s=SMALL_MARKER_SIZE,
            label="Median",
        )
        for col_num, col_name in enumerate(original.columns):
            minima, maxima = original[col_name].min(), original[col_name].max()
            min_precision = 2 if abs(minima) < HIGH_PRECISION_THRESHOLD else 1
            max_precision = 2 if abs(maxima) < HIGH_PRECISION_THRESHOLD else 1
            ax.text(
                low,
                col_num,
                f" {minima:.{min_precision}f}",
                ha="left",
                va="center",
                size=font_size,
            )
            ax.text(
                high,
                col_num,
                f"{maxima:.{max_precision}f} ",
                ha="right",
                va="center",
                size=font_size,
            )


def horizontal_bar_plot(
    original: DataFrame,
    adjusted: DataFrame,
    middle: float,
    plot_type: str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> Axes:
    """Plot horizontal bars for the middle of the data.

    Draws the bars, then the latest datapoints, then the extreme labels.
    """
    ax = plot_middle_bars(adjusted, middle, kwargs)
    font_size = SMALL_FONT_SIZE
    plot_latest_datapoint(ax, original, adjusted, font_size)
    label_extremes(ax, data=(original, adjusted), plot_type=plot_type, font_size=font_size, kwargs=kwargs)

    return ax


def label_x_axis(plot_from: int | Period, label: str | None, plot_type: str, ax: Axes, df: DataFrame) -> None:
    """Label the x-axis for the plot.

    Args:
        plot_from: starting Period, or an integer position into df.index.
        label: None leaves the axis unlabelled, an empty string requests a
            generated label, and any other string is used verbatim.
        plot_type: ZSCORES or ZSCALED; selects the generated label text.
        ax: the axes to label.
        df: the data, used to resolve an integer plot_from.

    """
    if label is None:
        return

    if not label:
        # Resolve the start only when it is needed, so an out-of-range
        # plot_from cannot fail when the caller supplied a label.
        start: Period = plot_from if isinstance(plot_from, Period) else df.index[plot_from]
        if plot_type == ZSCORES:
            label = f"Z-scores for prints since {label_period(start)}"
        else:
            label = f"-1 to 1 scaled z-scores since {label_period(start)}"
    ax.set_xlabel(label)


def mark_reference_lines(plot_type: str, ax: Axes) -> None:
    """Mark the reference lines for the plot.

    The scaled plot gets dashed vertical lines at -1 and +1; the z-score
    plot gets one at 0.
    """
    line_color = "#555555"
    line_style = "--"

    if plot_type == ZSCALED:
        ax.axvline(-1, color=line_color, linewidth=REFERENCE_LINE_WIDTH, linestyle=line_style, label="-1")
        ax.axvline(1, color=line_color, linewidth=REFERENCE_LINE_WIDTH, linestyle=line_style, label="+1")
    elif plot_type == ZSCORES:
        ax.axvline(0, color=line_color, linewidth=REFERENCE_LINE_WIDTH, linestyle=line_style, label="0")


def plot_the_data(df: DataFrame, **kwargs: Unpack[SummaryKwargs]) -> tuple[Axes, str]:
    """Plot the data as a summary plot.

    Args:
        df: DataFrame - the data to plot.
        kwargs: SummaryKwargs, additional keyword arguments for the plot.

    Returns:
        tuple[Axes, str]: A tuple comprising the Axes object and plot type ('zscores' or 'zscaled').

    Raises:
        ValueError: If middle value is not between 0 and 1, or if plot_type is invalid.

    """
    verbose = kwargs.pop("verbose", False)
    middle = float(kwargs.pop("middle", DEFAULT_MIDDLE))
    plot_type = kwargs.pop("plot_type", ZSCORES)

    # Validate inputs
    if not 0 < middle < 1:
        raise ValueError(f"Middle value must be between 0 and 1, got {middle}")
    if plot_type not in (ZSCORES, ZSCALED):
        raise ValueError(f"plot_type must be '{ZSCORES}' or '{ZSCALED}', got '{plot_type}'")

    subset, kwargsd = constrain_data(df, **kwargs)
    z_scores, z_scaled = calculate_z(subset, middle, verbose=verbose)

    # plot as required by the plot_types argument
    adjusted = z_scores if plot_type == ZSCORES else z_scaled
    ax = horizontal_bar_plot(subset, adjusted, middle, plot_type, kwargsd)
    ax.tick_params(axis="y", labelsize="small")
    # "legend" is optional, so a direct caller may not have supplied it
    make_legend(ax, legend=kwargsd.get("legend"))
    ax.set_xlim(kwargsd.get("xlim"))  # provide space for the labels

    return ax, plot_type


# --- public
def summary_plot(data: DataT, **kwargs: Unpack[SummaryKwargs]) -> Axes:
    """Plot a summary of historical data for a given DataFrame.

    Args:
        data: DataFrame containing the summary data. The column names are
            used as labels for the plot.
        kwargs: Additional arguments for the plot, including middle (float),
            plot_type (str), verbose (bool), and standard plotting options.

    Returns:
        Axes: A matplotlib Axes object containing the summary plot.

    Raises:
        TypeError: If data is not a DataFrame.

    """
    # --- check the kwargs
    report_kwargs(caller=ME, **kwargs)
    validate_kwargs(schema=SummaryKwargs, caller=ME, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, ME)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")

    # --- legend
    kwargs["legend"] = kwargs.get(
        "legend",
        {
            # put the legend below the x-axis label
            "loc": "upper center",
            "fontsize": "xx-small",
            "bbox_to_anchor": (0.5, -0.125),
            "ncol": 4,
        },
    )

    # --- and plot it ...
    ax, plot_type = plot_the_data(data, **kwargs)
    label_x_axis(
        kwargs.get("plot_from", DEFAULT_PLOT_FROM),
        label=kwargs.get("xlabel", ""),
        plot_type=plot_type,
        ax=ax,
        df=data,
    )
    mark_reference_lines(plot_type, ax)

    return ax
class SummaryKwargs(BaseKwargs):
    """Keyword arguments for the summary_plot function.

    Every key declared here is optional (``NotRequired``) and is added to
    whatever keys ``BaseKwargs`` declares.
    """

    # Axes for the plot; presumably consumed by get_axes() - TODO confirm.
    ax: NotRequired[Axes | None]
    # When true, calculate_z() prints a per-column summary table.
    verbose: NotRequired[bool]
    # Proportion of the data shown as the shaded bar; plot_the_data()
    # requires 0 < middle < 1 and defaults to DEFAULT_MIDDLE.
    middle: NotRequired[float]
    # Either ZSCORES or ZSCALED; plot_the_data() defaults to ZSCORES and
    # rejects any other value.
    plot_type: NotRequired[str]
    # Starting Period, or an integer position into the data's index; used
    # for the generated x-axis label and forwarded to constrain_data().
    plot_from: NotRequired[int | Period]
    # Forwarded to make_legend(); summary_plot() supplies a default dict
    # when the key is absent.
    legend: NotRequired[bool | dict[str, Any] | None]
    # None suppresses the x-axis label, an empty or absent value requests a
    # generated label, and any other string is used verbatim.
    xlabel: NotRequired[str | None]
Keyword arguments for the summary_plot function.
def calc_quantiles(middle: float) -> ndarray:
    """Return the pair of quantiles that bound the central band of the data.

    Args:
        middle: proportion of the data inside the band (e.g. 0.8 for the
            central 80%).

    Returns:
        ndarray: two-element array ``[lower, upper]``, where ``lower`` is
            ``(1 - middle) / 2`` and ``upper`` is ``1 - lower``.

    """
    tail = (1 - middle) / 2.0  # share of the data left in each tail
    lower, upper = tail, 1 - tail
    return array([lower, upper])
Calculate the quantiles for the middle of the data.
def calculate_z(
    original: DataFrame,
    middle: float,
    *,
    verbose: bool = False,
) -> tuple[DataFrame, DataFrame]:
    """Calculate z-scores, scaled z-scores and middle quantiles.

    Args:
        original: DataFrame containing the original data.
        middle: float, the proportion of data to highlight in the middle (e.g. 0.8 for 80%).
        verbose: bool, whether to print the summary data.

    Returns:
        tuple[DataFrame, DataFrame]: z_scores and z_scaled DataFrames. The
            scaled frame maps each column's smallest z-score to -1 and its
            largest to +1.

    Raises:
        ValueError: If original DataFrame is empty or has zero variance.

    """
    if original.empty:
        raise ValueError("Cannot calculate z-scores for empty DataFrame")

    # Check for zero variance
    std_dev = original.std()
    if (std_dev == 0).any():
        raise ValueError("Cannot calculate z-scores when standard deviation is zero")

    # Calculate z-scores
    mean = original.mean()
    z_scores: DataFrame = (original - mean) / std_dev

    # Scale z-scores between -1 and +1
    z_min = z_scores.min()
    z_range = z_scores.max() - z_min

    # A column can still have a zero z-range when rounding leaves a constant
    # column with a tiny non-zero standard deviation. Handle that per column
    # so one degenerate column does not flatten every other column.
    degenerate = z_range == 0
    z_scaled: DataFrame = (((z_scores - z_min) / z_range.mask(degenerate)) - 0.5) * 2
    for column in degenerate[degenerate].index:
        z_scaled[column] = z_scores[column] * 0  # all zeros if no spread

    if verbose:
        # No second emptiness check is needed here: an empty frame was
        # rejected at the top of the function.
        q_low, q_high = calc_quantiles(middle)
        frame = DataFrame(
            {
                "count": original.count(),
                "mean": mean,
                "median": original.median(),
                "min shaded": original.quantile(q=q_low),
                "max shaded": original.quantile(q=q_high),
                "z-scores": z_scores.iloc[-1],
                "scaled": z_scaled.iloc[-1],
            },
        )
        print(frame)

    return z_scores, z_scaled
Calculate z-scores, scaled z-scores and middle quantiles.
Args: original: DataFrame containing the original data. middle: float, the proportion of data to highlight in the middle (e.g. 0.8 for 80%). verbose: bool, whether to print the summary data.
Returns: tuple[DataFrame, DataFrame]: z_scores and z_scaled DataFrames.
Raises: ValueError: If original DataFrame is empty or has zero variance.
def plot_middle_bars(
    adjusted: DataFrame,
    middle: float,
    kwargs: dict[str, Any],
) -> Axes:
    """Draw one horizontal bar per column spanning the central band of the data.

    Args:
        adjusted: z-scores (or scaled z-scores), one column per series.
        middle: proportion of the data each bar should cover.
        kwargs: plotting options passed to get_axes(); this dictionary is
            updated in place with the computed "xlim".

    Returns:
        Axes: the axes holding the bars.

    Raises:
        ValueError: If adjusted is empty.

    """
    if adjusted.empty:
        raise ValueError("Cannot plot bars for empty DataFrame")

    quantile_pair = calc_quantiles(middle)
    lower_q, upper_q = quantile_pair[0], quantile_pair[1]
    # rows are the series, columns are the two quantiles
    band: DataFrame = adjusted.quantile(q=quantile_pair).T
    latest = adjusted.iloc[-1]

    # The axis must cover the bars, the latest points and at least
    # +/- SPAN_LIMIT, plus a margin.
    left_edge = min(latest.min(), band.min().min(), -SPAN_LIMIT) - SPACE_MARGIN
    right_edge = max(latest.max(), band.max().max(), SPAN_LIMIT) + SPACE_MARGIN
    kwargs["xlim"] = (left_edge, right_edge)  # later steps read this back

    ax, _ = get_axes(**kwargs)
    band_start = band[lower_q]
    band_width = band[upper_q] - band_start
    ax.barh(
        y=band.index,
        width=band_width,
        left=band_start,
        color="#bbbbbb",
        label=f"Middle {middle * 100:0.0f}% of prints",
    )
    return ax
Plot the middle (typically 80%) of the data as a bar.
def plot_latest_datapoint(
    ax: Axes,
    original: DataFrame,
    adjusted: DataFrame,
    font_size: int | str,
) -> None:
    """Mark the most recent observation of every series on the summary plot.

    Each point is drawn at its adjusted position and annotated with the
    matching value from the original data.

    Args:
        ax: the axes to draw on.
        original: the unadjusted data, used for the text annotations.
        adjusted: the z-scores or scaled z-scores, used for positions.
        font_size: size of the annotation text.

    Raises:
        ValueError: If either DataFrame is empty.

    """
    if original.empty or adjusted.empty:
        raise ValueError("Cannot plot datapoints for empty DataFrame")

    ax.scatter(adjusted.iloc[-1], adjusted.columns, color="darkorange", label="Latest")

    last_label = adjusted.index[-1]
    for position, column in enumerate(original.columns):
        raw_value = float(original.at[last_label, column])
        plotted_at = float(adjusted.at[last_label, column])
        # small magnitudes get an extra decimal place
        if abs(raw_value) < HIGH_PRECISION_THRESHOLD:
            decimals = 2
        else:
            decimals = 1
        ax.text(
            x=plotted_at,
            y=position,
            s=f"{raw_value:.{decimals}f}",
            ha="center",
            va="center",
            size=font_size,
        )
Add the latest datapoints to the summary plot.
def label_extremes(
    ax: Axes,
    data: tuple[DataFrame, DataFrame],
    plot_type: str,
    font_size: int | str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> None:
    """Apply the x-limits and, for scaled plots, annotate medians and extremes.

    Args:
        ax: the axes to annotate.
        data: the (original, adjusted) pair of DataFrames.
        plot_type: only ZSCALED triggers the median markers and the
            minimum/maximum text at the plot edges.
        font_size: size of the annotation text.
        kwargs: must contain "xlim" as a (low, high) pair.

    """
    original, adjusted = data
    left_edge, right_edge = kwargs["xlim"]
    ax.set_xlim(left_edge, right_edge)

    if plot_type != ZSCALED:
        return

    ax.scatter(
        adjusted.median(),
        adjusted.columns,
        color="darkorchid",
        marker="x",
        s=SMALL_MARKER_SIZE,
        label="Median",
    )
    for position, column in enumerate(original.columns):
        series = original[column]
        smallest, largest = series.min(), series.max()
        # small magnitudes get an extra decimal place
        low_decimals = 2 if abs(smallest) < HIGH_PRECISION_THRESHOLD else 1
        high_decimals = 2 if abs(largest) < HIGH_PRECISION_THRESHOLD else 1
        ax.text(
            left_edge,
            position,
            f" {smallest:.{low_decimals}f}",
            ha="left",
            va="center",
            size=font_size,
        )
        ax.text(
            right_edge,
            position,
            f"{largest:.{high_decimals}f} ",
            ha="right",
            va="center",
            size=font_size,
        )
Label the extremes in the scaled plots.
def horizontal_bar_plot(
    original: DataFrame,
    adjusted: DataFrame,
    middle: float,
    plot_type: str,
    kwargs: dict[str, Any],  # must be a dictionary, not a splat
) -> Axes:
    """Build the summary chart: bars, then latest points, then edge labels.

    Args:
        original: the unadjusted data.
        adjusted: the z-scores or scaled z-scores that are plotted.
        middle: proportion of the data covered by each bar.
        plot_type: ZSCORES or ZSCALED.
        kwargs: plotting options; plot_middle_bars() stores "xlim" in it,
            which label_extremes() then reads.

    Returns:
        Axes: the axes holding the chart.

    """
    axes = plot_middle_bars(adjusted, middle, kwargs)
    plot_latest_datapoint(axes, original, adjusted, SMALL_FONT_SIZE)
    label_extremes(
        axes,
        data=(original, adjusted),
        plot_type=plot_type,
        font_size=SMALL_FONT_SIZE,
        kwargs=kwargs,
    )
    return axes
Plot horizontal bars for the middle of the data.
def label_x_axis(plot_from: int | Period, label: str | None, plot_type: str, ax: Axes, df: DataFrame) -> None:
    """Label the x-axis for the plot.

    Args:
        plot_from: starting Period, or an integer position into df.index.
        label: None leaves the axis unlabelled, an empty string requests a
            generated label, and any other string is used verbatim.
        plot_type: ZSCORES or ZSCALED; selects the generated label text.
        ax: the axes to label.
        df: the data, used to resolve an integer plot_from.

    """
    if label is None:
        return

    if not label:
        # Resolve the start only when it is needed, so an out-of-range
        # plot_from cannot fail when the caller supplied a label.
        start: Period = plot_from if isinstance(plot_from, Period) else df.index[plot_from]
        if plot_type == ZSCORES:
            label = f"Z-scores for prints since {label_period(start)}"
        else:
            label = f"-1 to 1 scaled z-scores since {label_period(start)}"
    ax.set_xlabel(label)
Label the x-axis for the plot.
def mark_reference_lines(plot_type: str, ax: Axes) -> None:
    """Draw the dashed vertical guide lines that suit the plot type.

    The scaled plot gets lines at -1 and +1, the z-score plot gets a line
    at 0, and any other plot type gets none.
    """
    # (x position, legend label) pairs for each plot type
    guides: dict[str, tuple[tuple[int, str], ...]] = {
        ZSCALED: ((-1, "-1"), (1, "+1")),
        ZSCORES: ((0, "0"),),
    }
    for x_position, legend_text in guides.get(plot_type, ()):
        ax.axvline(
            x_position,
            color="#555555",
            linewidth=REFERENCE_LINE_WIDTH,
            linestyle="--",
            label=legend_text,
        )
Mark the reference lines for the plot.
def plot_the_data(df: DataFrame, **kwargs: Unpack[SummaryKwargs]) -> tuple[Axes, str]:
    """Plot the data as a summary plot.

    Args:
        df: DataFrame - the data to plot.
        kwargs: SummaryKwargs, additional keyword arguments for the plot.
            "verbose", "middle" and "plot_type" are consumed here; the rest
            is passed to constrain_data().

    Returns:
        tuple[Axes, str]: A tuple comprising the Axes object and plot type ('zscores' or 'zscaled').

    Raises:
        ValueError: If middle value is not between 0 and 1, or if plot_type is invalid.

    """
    verbose = kwargs.pop("verbose", False)
    middle = float(kwargs.pop("middle", DEFAULT_MIDDLE))
    plot_type = kwargs.pop("plot_type", ZSCORES)

    # Validate inputs
    if not 0 < middle < 1:
        raise ValueError(f"Middle value must be between 0 and 1, got {middle}")
    if plot_type not in (ZSCORES, ZSCALED):
        raise ValueError(f"plot_type must be '{ZSCORES}' or '{ZSCALED}', got '{plot_type}'")

    subset, kwargsd = constrain_data(df, **kwargs)
    z_scores, z_scaled = calculate_z(subset, middle, verbose=verbose)

    # plot as required by the plot_type argument
    adjusted = z_scores if plot_type == ZSCORES else z_scaled
    ax = horizontal_bar_plot(subset, adjusted, middle, plot_type, kwargsd)
    ax.tick_params(axis="y", labelsize="small")
    # "legend" is optional, so a direct caller may not have supplied it;
    # presumably make_legend() treats None as "no legend" - TODO confirm.
    make_legend(ax, legend=kwargsd.get("legend"))
    ax.set_xlim(kwargsd.get("xlim"))  # provide space for the labels

    return ax, plot_type
Plot the data as a summary plot.
Args: df: DataFrame - the data to plot. kwargs: SummaryKwargs, additional keyword arguments for the plot.
Returns: tuple[Axes, str]: A tuple comprising the Axes object and plot type ('zscores' or 'zscaled').
Raises: ValueError: If middle value is not between 0 and 1, or if plot_type is invalid.
def summary_plot(data: DataT, **kwargs: Unpack[SummaryKwargs]) -> Axes:
    """Draw a one-row-per-column summary of where the latest values sit.

    Args:
        data: DataFrame containing the summary data. The column names are
            used as labels for the plot.
        kwargs: Additional arguments for the plot, including middle (float),
            plot_type (str), verbose (bool), and standard plotting options.

    Returns:
        Axes: A matplotlib Axes object containing the summary plot.

    Raises:
        TypeError: If data is not a DataFrame.

    """
    # --- keyword arguments are reported and validated first
    report_kwargs(caller=ME, **kwargs)
    validate_kwargs(schema=SummaryKwargs, caller=ME, **kwargs)

    # --- then the data
    data = check_clean_timeseries(data, ME)
    if not isinstance(data, DataFrame):
        raise TypeError("data must be a pandas DataFrame for summary_plot()")

    # --- default legend goes below the x-axis label, unless the caller chose one
    legend_below_axis = {
        "loc": "upper center",
        "fontsize": "xx-small",
        "bbox_to_anchor": (0.5, -0.125),
        "ncol": 4,
    }
    kwargs.setdefault("legend", legend_below_axis)

    # --- draw the chart, then label and decorate it
    ax, plot_type = plot_the_data(data, **kwargs)
    starting_point = kwargs.get("plot_from", DEFAULT_PLOT_FROM)
    requested_label = kwargs.get("xlabel", "")
    label_x_axis(
        starting_point,
        label=requested_label,
        plot_type=plot_type,
        ax=ax,
        df=data,
    )
    mark_reference_lines(plot_type, ax)

    return ax
Plot a summary of historical data for a given DataFrame.
Args: data: DataFrame containing the summary data. The column names are used as labels for the plot. kwargs: Additional arguments for the plot, including middle (float), plot_type (str), verbose (bool), and standard plotting options.
Returns: Axes: A matplotlib Axes object containing the summary plot.
Raises: TypeError: If data is not a DataFrame.