mgplot.postcovid_plot

Plot the pre-COVID trajectory against the current trend.

  1"""Plot the pre-COVID trajectory against the current trend."""
  2
  3from typing import NotRequired, Unpack, cast
  4
  5from matplotlib.axes import Axes
  6from numpy import array, polyfit
  7from pandas import DataFrame, Period, PeriodIndex, Series, period_range
  8
  9from mgplot.keyword_checking import (
 10    report_kwargs,
 11    validate_kwargs,
 12)
 13from mgplot.line_plot import LineKwargs, line_plot
 14from mgplot.settings import DataT, get_setting
 15from mgplot.utilities import check_clean_timeseries
 16
 17# --- constants
 18ME = "postcovid_plot"
 19MIN_REGRESSION_POINTS = 10  # minimum number of points before making a regression
 20
 21# Default regression periods by frequency
 22DEFAULT_PERIODS = {
 23    "Q": {"start": "2014Q4", "end": "2019Q4"},
 24    "M": {"start": "2015-01", "end": "2020-01"},
 25    "D": {"start": "2015-01-01", "end": "2020-01-01"},
 26}
 27
 28
 29class PostcovidKwargs(LineKwargs):
 30    """Keyword arguments for the post-COVID plot."""
 31
 32    start_r: NotRequired[Period]  # start of regression period
 33    end_r: NotRequired[Period]  # end of regression period
 34
 35
 36# --- functions
 37def get_projection(original: Series, to_period: Period) -> Series:
 38    """Create a linear projection based on pre-COVID data.
 39
 40    Assumes the start of the data has been trimmed to the period before COVID.
 41
 42    Args:
 43        original: Series - the original series with a PeriodIndex
 44            Assume the index is a PeriodIndex, that is unique and monotonic increasing.
 45        to_period: Period - the period to which the projection should extend.
 46
 47    Returns:
 48        Series: A pandas Series with linear projection values using the same index as original.
 49
 50    Raises:
 51        ValueError: If to_period is not within the original series index range.
 52
 53    """
 54    # --- using ordinals to manage gaps during the regression period (eg in Job Vacancy data)
 55    op_index = cast("PeriodIndex", original.index)
 56    y_regress = original[original.index <= to_period].to_numpy()
 57    x_regress = array([p.ordinal for p in op_index if p <= to_period])
 58    x_complete = array([p.ordinal for p in op_index])
 59    m, b = polyfit(x_regress, y_regress, 1)
 60    regression = Series((x_complete * m) + b, index=original.index)
 61    regression = regression.reindex(period_range(start=op_index[0], end=op_index[-1])).interpolate(
 62        method="linear"
 63    )
 64    regression.index.name = original.index.name
 65    return regression
 66
 67
 68def regression_period(data: Series, **kwargs: Unpack[PostcovidKwargs]) -> tuple[Period, Period, bool]:
 69    """Establish the regression period.
 70
 71    Args:
 72        data: Series - the original time series data.
 73        **kwargs: Additional keyword arguments.
 74
 75    Returns:
 76        A tuple containing the start and end periods for regression,
 77        and a boolean indicating if the period is robust.
 78
 79    """
 80    # --- check that the series index is a PeriodIndex with a valid frequency
 81    series_index = PeriodIndex(data.index)
 82    freq_str = series_index.freqstr
 83    freq_key = freq_str[0]
 84    if not freq_str or freq_key not in ("Q", "M", "D"):
 85        raise ValueError("The series index must have a D, M or Q frequency")
 86
 87    # --- set the default regression period
 88    default_periods = DEFAULT_PERIODS[freq_key]
 89    start_regression = Period(default_periods["start"], freq=freq_str)
 90    end_regression = Period(default_periods["end"], freq=freq_str)
 91
 92    # --- Override defaults with user-provided periods if specified
 93    user_start = kwargs.pop("start_r", None)
 94    user_end = kwargs.pop("end_r", None)
 95
 96    start_r = Period(user_start, freq=freq_str) if user_start else start_regression
 97    end_r = Period(user_end, freq=freq_str) if user_end else end_regression
 98
 99    # --- Validate the regression period
100    robust = True
101    if start_r >= end_r:
102        print(f"Invalid regression period: {start_r=}, {end_r=}")
103        robust = False
104    no_nan_series = data.dropna()
105    if (
106        number := len(no_nan_series[(no_nan_series.index >= start_r) & (no_nan_series.index <= end_r)])
107    ) < MIN_REGRESSION_POINTS:
108        print(f"Insufficient data points (n={number}) for regression.")
109        robust = False
110
111    return start_r, end_r, robust
112
113
def postcovid_plot(data: DataT, **kwargs: Unpack[PostcovidKwargs]) -> Axes:
    """Plot a series with a PeriodIndex alongside a projection of its pre-COVID trend.

    The regression window is obtained from regression_period(). When that
    window is not usable, a message is printed and the series is plotted on
    its own. Otherwise the data from the start of the window onwards is
    plotted together with the line returned by get_projection().

    Args:
        data: Series - the series to be plotted.
        kwargs: PostcovidKwargs - plotting arguments. "plot_from" is dropped
            with a warning; "start_r" and "end_r" are consumed here and are
            not forwarded to line_plot().

    Returns:
        Axes: whatever line_plot() returns.

    Raises:
        TypeError: if the data is not a pandas Series after check_clean_timeseries().
        ValueError: from regression_period() if the index frequency is not D, M or Q.
        NOTE(review): check_clean_timeseries() presumably rejects an index
            that is not a PeriodIndex - confirm against that helper.

    """
    # --- check the kwargs
    report_kwargs(caller=ME, **kwargs)
    validate_kwargs(schema=PostcovidKwargs, caller=ME, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, ME)
    if not isinstance(data, Series):
        raise TypeError("The series argument must be a pandas Series")

    # line_plot() validates the remaining kwargs; discard the one that has no meaning here
    if "plot_from" in kwargs:
        print("Warning: the 'plot_from' argument is ignored in postcovid_plot().")
        kwargs.pop("plot_from")

    # --- set the regression period, then strip its keys so line_plot() never sees them
    start_r, end_r, robust = regression_period(data, **kwargs)
    for consumed_key in ("start_r", "end_r"):
        kwargs.pop(consumed_key, None)

    if not robust:
        print("No valid regression period found; plotting raw data only.")
        return line_plot(data, **cast("LineKwargs", kwargs))

    # --- combine data and projection
    first_observed = data.dropna().index.min()
    if start_r < first_observed:
        print(f"Caution: Regression start period pre-dates the series index: {start_r=}")
    observed = data[data.index >= start_r].copy()
    observed.name = "Series"
    trend = get_projection(observed, end_r)
    trend.name = "Pre-COVID projection"

    # --- projection column first, series second; the paired defaults below rely on this order
    combined_data = DataFrame({trend.name: trend, observed.name: observed})

    # --- activate plot settings: caller-supplied values win, otherwise use these defaults
    plot_defaults = {
        "width": (get_setting("line_normal"), get_setting("line_wide")),  # series thicker than projection
        "style": ("--", "-"),  # dashed regression line
        "label_series": True,
        "annotate": (False, True),  # annotate series only
        "color": ("darkblue", "#dd0000"),
        "dropna": False,
    }
    options = cast("dict[str, object]", kwargs)  # same mapping, viewed as a plain dict
    for key, fallback in plot_defaults.items():
        options[key] = options.pop(key, fallback)

    return line_plot(combined_data, **cast("LineKwargs", kwargs))
# Name of this plot type; passed as the caller label to the keyword-checking helpers.
ME = 'postcovid_plot'
# Minimum number of points before making a regression.
MIN_REGRESSION_POINTS = 10
# Default regression start/end periods, keyed by the first letter of the index frequency.
DEFAULT_PERIODS = {'Q': {'start': '2014Q4', 'end': '2019Q4'}, 'M': {'start': '2015-01', 'end': '2020-01'}, 'D': {'start': '2015-01-01', 'end': '2020-01-01'}}
class PostcovidKwargs(mgplot.line_plot.LineKwargs):
class PostcovidKwargs(LineKwargs):
    """Keyword arguments accepted by the post-COVID plot.

    Extends LineKwargs with two optional keys that set the regression window.
    """

    # first period of the regression window (optional)
    start_r: NotRequired[Period]
    # last period of the regression window (optional)
    end_r: NotRequired[Period]

Keyword arguments for the post-COVID plot.

start_r: NotRequired[pandas._libs.tslibs.period.Period]
end_r: NotRequired[pandas._libs.tslibs.period.Period]
def get_projection( original: pandas.core.series.Series, to_period: pandas._libs.tslibs.period.Period) -> pandas.core.series.Series:
def get_projection(original: Series, to_period: Period) -> Series:
    """Create a linear projection based on pre-COVID data.

    A straight line is fitted (least squares on the period ordinals) to the
    observations at or before ``to_period``. That line is then evaluated for
    every period from the first to the last entry of the index of ``original``.

    Assumes the start of the data has been trimmed to the period before COVID.

    Args:
        original: Series - the original series with a PeriodIndex.
            Assume the index is a PeriodIndex, that is unique and monotonic increasing.
            Missing (NaN) observations are left out of the fit.
        to_period: Period - the last period included in the regression.

    Returns:
        Series: the fitted line over the complete period range spanned by the
        index of original (periods absent from that index are filled in).

    Raises:
        ValueError: If fewer than two non-missing observations fall at or
            before to_period, so that no line can be fitted.

    """
    # --- using ordinals to manage gaps during the regression period (eg in Job Vacancy data)
    op_index = cast("PeriodIndex", original.index)

    # only non-missing observations inside the regression window take part in
    # the fit; a single NaN handed to polyfit would poison the whole line
    in_fit = (op_index <= to_period) & original.notna().to_numpy()
    if in_fit.sum() < 2:
        raise ValueError(
            f"Need at least two non-missing observations at or before {to_period} to fit a projection."
        )
    x_regress = array([p.ordinal for p in op_index[in_fit]])
    y_regress = original[in_fit].to_numpy()
    m, b = polyfit(x_regress, y_regress, 1)

    # evaluate the line on every period between the first and last index entry,
    # which also covers any periods missing from the original index
    full_index = period_range(start=op_index[0], end=op_index[-1])
    x_complete = array([p.ordinal for p in full_index])
    regression = Series((x_complete * m) + b, index=full_index)
    regression.index.name = original.index.name
    return regression

Create a linear projection based on pre-COVID data.

Assumes the start of the data has been trimmed to the period before COVID.

Args: `original` (Series): the original series with a PeriodIndex; the index is assumed to be a PeriodIndex that is unique and monotonic increasing. `to_period` (Period): the period to which the projection should extend.

Returns: Series: A pandas Series with linear projection values using the same index as original.

Raises: ValueError: If to_period is not within the original series index range.

def regression_period( data: pandas.core.series.Series, **kwargs: Unpack[PostcovidKwargs]) -> tuple[pandas._libs.tslibs.period.Period, pandas._libs.tslibs.period.Period, bool]:
 69def regression_period(data: Series, **kwargs: Unpack[PostcovidKwargs]) -> tuple[Period, Period, bool]:
 70    """Establish the regression period.
 71
 72    Args:
 73        data: Series - the original time series data.
 74        **kwargs: Additional keyword arguments.
 75
 76    Returns:
 77        A tuple containing the start and end periods for regression,
 78        and a boolean indicating if the period is robust.
 79
 80    """
 81    # --- check that the series index is a PeriodIndex with a valid frequency
 82    series_index = PeriodIndex(data.index)
 83    freq_str = series_index.freqstr
 84    freq_key = freq_str[0]
 85    if not freq_str or freq_key not in ("Q", "M", "D"):
 86        raise ValueError("The series index must have a D, M or Q frequency")
 87
 88    # --- set the default regression period
 89    default_periods = DEFAULT_PERIODS[freq_key]
 90    start_regression = Period(default_periods["start"], freq=freq_str)
 91    end_regression = Period(default_periods["end"], freq=freq_str)
 92
 93    # --- Override defaults with user-provided periods if specified
 94    user_start = kwargs.pop("start_r", None)
 95    user_end = kwargs.pop("end_r", None)
 96
 97    start_r = Period(user_start, freq=freq_str) if user_start else start_regression
 98    end_r = Period(user_end, freq=freq_str) if user_end else end_regression
 99
100    # --- Validate the regression period
101    robust = True
102    if start_r >= end_r:
103        print(f"Invalid regression period: {start_r=}, {end_r=}")
104        robust = False
105    no_nan_series = data.dropna()
106    if (
107        number := len(no_nan_series[(no_nan_series.index >= start_r) & (no_nan_series.index <= end_r)])
108    ) < MIN_REGRESSION_POINTS:
109        print(f"Insufficient data points (n={number}) for regression.")
110        robust = False
111
112    return start_r, end_r, robust

Establish the regression period.

Args: data: Series - the original time series data. **kwargs: Additional keyword arguments.

Returns: A tuple containing the start and end periods for regression, and a boolean indicating if the period is robust.

def postcovid_plot( data: ~DataT, **kwargs: Unpack[PostcovidKwargs]) -> matplotlib.axes._axes.Axes:
def postcovid_plot(data: DataT, **kwargs: Unpack[PostcovidKwargs]) -> Axes:
    """Plot a series with a PeriodIndex alongside a projection of its pre-COVID trend.

    The regression window is obtained from regression_period(). When that
    window is not usable, a message is printed and the series is plotted on
    its own. Otherwise the data from the start of the window onwards is
    plotted together with the line returned by get_projection().

    Args:
        data: Series - the series to be plotted.
        kwargs: PostcovidKwargs - plotting arguments. "plot_from" is dropped
            with a warning; "start_r" and "end_r" are consumed here and are
            not forwarded to line_plot().

    Returns:
        Axes: whatever line_plot() returns.

    Raises:
        TypeError: if the data is not a pandas Series after check_clean_timeseries().
        ValueError: from regression_period() if the index frequency is not D, M or Q.
        NOTE(review): check_clean_timeseries() presumably rejects an index
            that is not a PeriodIndex - confirm against that helper.

    """
    # --- check the kwargs
    report_kwargs(caller=ME, **kwargs)
    validate_kwargs(schema=PostcovidKwargs, caller=ME, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, ME)
    if not isinstance(data, Series):
        raise TypeError("The series argument must be a pandas Series")

    # line_plot() validates the remaining kwargs; discard the one that has no meaning here
    if "plot_from" in kwargs:
        print("Warning: the 'plot_from' argument is ignored in postcovid_plot().")
        kwargs.pop("plot_from")

    # --- set the regression period, then strip its keys so line_plot() never sees them
    start_r, end_r, robust = regression_period(data, **kwargs)
    for consumed_key in ("start_r", "end_r"):
        kwargs.pop(consumed_key, None)

    if not robust:
        print("No valid regression period found; plotting raw data only.")
        return line_plot(data, **cast("LineKwargs", kwargs))

    # --- combine data and projection
    first_observed = data.dropna().index.min()
    if start_r < first_observed:
        print(f"Caution: Regression start period pre-dates the series index: {start_r=}")
    observed = data[data.index >= start_r].copy()
    observed.name = "Series"
    trend = get_projection(observed, end_r)
    trend.name = "Pre-COVID projection"

    # --- projection column first, series second; the paired defaults below rely on this order
    combined_data = DataFrame({trend.name: trend, observed.name: observed})

    # --- activate plot settings: caller-supplied values win, otherwise use these defaults
    plot_defaults = {
        "width": (get_setting("line_normal"), get_setting("line_wide")),  # series thicker than projection
        "style": ("--", "-"),  # dashed regression line
        "label_series": True,
        "annotate": (False, True),  # annotate series only
        "color": ("darkblue", "#dd0000"),
        "dropna": False,
    }
    options = cast("dict[str, object]", kwargs)  # same mapping, viewed as a plain dict
    for key, fallback in plot_defaults.items():
        options[key] = options.pop(key, fallback)

    return line_plot(combined_data, **cast("LineKwargs", kwargs))

Plot a series with a PeriodIndex, including a post-COVID projection.

Args: data: Series - the series to be plotted. kwargs: PostcovidKwargs - plotting arguments.

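Raises: TypeError if the data is not a pandas Series; TypeError if the series does not have a PeriodIndex; ValueError if the series does not have a D, M or Q frequency. An invalid regression period (for example, a start that is not before the end, or too few data points) does not raise: a message is printed and the raw data is plotted on its own.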