mgplot.postcovid_plot
Plot the pre-COVID trajectory against the current trend.
"""Plot the pre-COVID trajectory against the current trend."""

from typing import NotRequired, Unpack, cast

from matplotlib.axes import Axes
from numpy import array, polyfit
from pandas import DataFrame, Period, PeriodIndex, Series, period_range

from mgplot.keyword_checking import (
    report_kwargs,
    validate_kwargs,
)
from mgplot.line_plot import LineKwargs, line_plot
from mgplot.settings import DataT, get_setting
from mgplot.utilities import check_clean_timeseries

# --- constants
ME = "postcovid_plot"
MIN_REGRESSION_POINTS = 10  # minimum number of points before making a regression

# Default regression periods by frequency
DEFAULT_PERIODS = {
    "Q": {"start": "2014Q4", "end": "2019Q4"},
    "M": {"start": "2015-01", "end": "2020-01"},
    "D": {"start": "2015-01-01", "end": "2020-01-01"},
}


class PostcovidKwargs(LineKwargs):
    """Keyword arguments for the post-COVID plot."""

    start_r: NotRequired[Period]  # start of regression period
    end_r: NotRequired[Period]  # end of regression period


# --- functions
def get_projection(original: Series, to_period: Period) -> Series:
    """Create a linear projection based on pre-COVID data.

    A straight line is fitted by least squares to the non-missing observations
    dated on or before to_period, and is then evaluated for every period from
    the first to the last period of original.

    Assumes the series has already been trimmed so that it begins at the start
    of the regression period.

    Args:
        original: Series - the original series with a PeriodIndex.
            The index is assumed to be unique and monotonic increasing;
            it may contain gaps.
        to_period: Period - the last period used to fit the regression line.

    Returns:
        Series: the fitted line, indexed by the complete period range spanning
        the original index (gaps filled), carrying the original index name.

    Raises:
        ValueError: If fewer than two non-missing observations fall on or
            before to_period, so no line can be fitted.

    """
    min_fit_points = 2  # a straight line needs at least two observations

    # --- using ordinals to manage gaps during the regression period (eg in Job Vacancy data)
    op_index = cast("PeriodIndex", original.index)

    # --- a single NaN would poison the whole least-squares fit, so fit on observed values only
    in_fit = (op_index <= to_period) & original.notna().to_numpy()
    if in_fit.sum() < min_fit_points:
        raise ValueError(
            f"Need at least two non-missing observations on or before {to_period} to fit a projection."
        )
    x_regress = array([p.ordinal for p in op_index[in_fit]])
    y_regress = original[in_fit].to_numpy(dtype=float)
    m, b = polyfit(x_regress, y_regress, 1)

    # --- evaluate the line on every period in the span, including periods absent from the index
    full_index = period_range(start=op_index[0], end=op_index[-1])
    x_complete = array([p.ordinal for p in full_index])
    regression = Series((x_complete * m) + b, index=full_index)
    regression.index.name = original.index.name
    return regression


def regression_period(data: Series, **kwargs: Unpack[PostcovidKwargs]) -> tuple[Period, Period, bool]:
    """Establish the regression period.

    The window defaults to the DEFAULT_PERIODS entry for the frequency of the
    series; "start_r" and "end_r" in kwargs override the respective ends.

    Args:
        data: Series - the original time series data.
        **kwargs: Additional keyword arguments; only "start_r" and "end_r" are read.

    Returns:
        A tuple containing the start and end periods for regression,
        and a boolean indicating if the period is robust (start before end,
        and at least MIN_REGRESSION_POINTS non-missing observations in it).

    Raises:
        ValueError: If the series index does not have a D, M or Q frequency.

    """
    # --- check that the series index is a PeriodIndex with a valid frequency
    series_index = PeriodIndex(data.index)
    freq_str = series_index.freqstr
    # guard before indexing, so a missing frequency yields the ValueError below
    freq_key = freq_str[0] if freq_str else ""
    if freq_key not in DEFAULT_PERIODS:
        raise ValueError("The series index must have a D, M or Q frequency")

    # --- defaults for this frequency, overridden by user-provided periods if specified
    default_periods = DEFAULT_PERIODS[freq_key]
    start_r = Period(kwargs.get("start_r") or default_periods["start"], freq=freq_str)
    end_r = Period(kwargs.get("end_r") or default_periods["end"], freq=freq_str)

    # --- Validate the regression period
    robust = True
    if start_r >= end_r:
        print(f"Invalid regression period: {start_r=}, {end_r=}")
        robust = False
    no_nan_series = data.dropna()
    in_window = (no_nan_series.index >= start_r) & (no_nan_series.index <= end_r)
    number = len(no_nan_series[in_window])
    if number < MIN_REGRESSION_POINTS:
        print(f"Insufficient data points (n={number}) for regression.")
        robust = False

    return start_r, end_r, robust


def postcovid_plot(data: DataT, **kwargs: Unpack[PostcovidKwargs]) -> Axes:
    """Plot a series with a PeriodIndex, including a post-COVID projection.

    If the regression period is not usable (start not before end, or fewer
    than MIN_REGRESSION_POINTS observations in it) no exception is raised:
    a message is printed and the raw series is plotted without a projection.

    Args:
        data: Series - the series to be plotted.
        kwargs: PostcovidKwargs - plotting arguments. "plot_from" is ignored
            (with a warning); "start_r" and "end_r" set the regression period.

    Returns:
        Axes: the matplotlib Axes returned by line_plot().

    Raises:
        TypeError: if the cleaned data is not a pandas Series.
        ValueError: if the series does not have a D, M or Q frequency.
        Other errors may come from check_clean_timeseries() or line_plot().

    """
    # --- check the kwargs
    report_kwargs(caller=ME, **kwargs)
    validate_kwargs(schema=PostcovidKwargs, caller=ME, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, ME)
    if not isinstance(data, Series):
        raise TypeError("The series argument must be a pandas Series")

    # rely on line_plot() to validate kwargs, but remove any that are not relevant
    if "plot_from" in kwargs:
        print("Warning: the 'plot_from' argument is ignored in postcovid_plot().")
        del kwargs["plot_from"]

    # --- set the regression period
    start_r, end_r, robust = regression_period(data, **kwargs)
    kwargs.pop("start_r", None)  # not a line_plot() argument
    kwargs.pop("end_r", None)  # not a line_plot() argument
    if not robust:
        print("No valid regression period found; plotting raw data only.")
        return line_plot(
            data,
            **cast("LineKwargs", kwargs),
        )

    # --- combine data and projection
    if start_r < data.dropna().index.min():
        print(f"Caution: Regression start period pre-dates the series index: {start_r=}")
    recent_data = data[data.index >= start_r].copy()
    recent_data.name = "Series"
    # the regression period was validated on non-missing values, so fit on the same:
    # drop NaNs inside the regression window, keep later rows so the projection
    # still runs to the end of the data
    fit_data = recent_data[recent_data.notna() | (recent_data.index > end_r)]
    projection_data = get_projection(fit_data, end_r)
    projection_data.name = "Pre-COVID projection"

    # --- Create DataFrame with proper column alignment
    combined_data = DataFrame(
        {
            projection_data.name: projection_data,
            recent_data.name: recent_data,
        }
    )

    # --- activate plot settings (caller-supplied values take precedence)
    kwargs.setdefault(
        "width",
        (get_setting("line_normal"), get_setting("line_wide")),
    )  # series line is thicker than projection
    kwargs.setdefault("style", ("--", "-"))  # dashed regression line
    kwargs.setdefault("label_series", True)
    kwargs.setdefault("annotate", (False, True))  # annotate series only
    kwargs.setdefault("color", ("darkblue", "#dd0000"))
    kwargs.setdefault("dropna", False)  # keep NaN values so gaps in the series stay visible

    return line_plot(
        combined_data,
        **cast("LineKwargs", kwargs),
    )
class PostcovidKwargs(LineKwargs):
    """Keyword arguments for the post-COVID plot.

    Extends LineKwargs with two optional keys that set the regression
    period used to fit the pre-COVID trend line.
    """

    start_r: NotRequired[Period]  # start of regression period
    end_r: NotRequired[Period]  # end of regression period
Keyword arguments for the post-COVID plot.
def get_projection(original: Series, to_period: Period) -> Series:
    """Create a linear projection based on pre-COVID data.

    A straight line is fitted by least squares to the non-missing observations
    dated on or before to_period, and is then evaluated for every period from
    the first to the last period of original.

    Assumes the series has already been trimmed so that it begins at the start
    of the regression period.

    Args:
        original: Series - the original series with a PeriodIndex.
            The index is assumed to be unique and monotonic increasing;
            it may contain gaps.
        to_period: Period - the last period used to fit the regression line.

    Returns:
        Series: the fitted line, indexed by the complete period range spanning
        the original index (gaps filled), carrying the original index name.

    Raises:
        ValueError: If fewer than two non-missing observations fall on or
            before to_period, so no line can be fitted.

    """
    min_fit_points = 2  # a straight line needs at least two observations

    # --- using ordinals to manage gaps during the regression period (eg in Job Vacancy data)
    op_index = cast("PeriodIndex", original.index)

    # --- a single NaN would poison the whole least-squares fit, so fit on observed values only
    in_fit = (op_index <= to_period) & original.notna().to_numpy()
    if in_fit.sum() < min_fit_points:
        raise ValueError(
            f"Need at least two non-missing observations on or before {to_period} to fit a projection."
        )
    x_regress = array([p.ordinal for p in op_index[in_fit]])
    y_regress = original[in_fit].to_numpy(dtype=float)
    m, b = polyfit(x_regress, y_regress, 1)

    # --- evaluate the line on every period in the span, including periods absent from the index
    full_index = period_range(start=op_index[0], end=op_index[-1])
    x_complete = array([p.ordinal for p in full_index])
    regression = Series((x_complete * m) + b, index=full_index)
    regression.index.name = original.index.name
    return regression
Create a linear projection based on pre-COVID data.
Assumes the series has already been trimmed so that it begins at the start of the pre-COVID regression period.
Args: original: Series - the original series with a PeriodIndex Assume the index is a PeriodIndex, that is unique and monotonic increasing. to_period: Period - the period to which the projection should extend.
Returns: Series: A pandas Series of linear projection values, indexed by the complete period range from the first to the last period of original (any gaps in the original index are filled).
Raises: ValueError: If to_period is not within the original series index range.
def regression_period(data: Series, **kwargs: Unpack[PostcovidKwargs]) -> tuple[Period, Period, bool]:
    """Establish the regression period.

    The window defaults to the DEFAULT_PERIODS entry for the frequency of the
    series; "start_r" and "end_r" in kwargs override the respective ends.

    Args:
        data: Series - the original time series data.
        **kwargs: Additional keyword arguments; only "start_r" and "end_r" are read.

    Returns:
        A tuple containing the start and end periods for regression,
        and a boolean indicating if the period is robust (start before end,
        and at least MIN_REGRESSION_POINTS non-missing observations in it).

    Raises:
        ValueError: If the series index does not have a D, M or Q frequency.

    """
    # --- check that the series index is a PeriodIndex with a valid frequency
    series_index = PeriodIndex(data.index)
    freq_str = series_index.freqstr
    # guard before indexing, so a missing frequency yields the ValueError below
    freq_key = freq_str[0] if freq_str else ""
    if freq_key not in DEFAULT_PERIODS:
        raise ValueError("The series index must have a D, M or Q frequency")

    # --- defaults for this frequency, overridden by user-provided periods if specified
    default_periods = DEFAULT_PERIODS[freq_key]
    start_r = Period(kwargs.get("start_r") or default_periods["start"], freq=freq_str)
    end_r = Period(kwargs.get("end_r") or default_periods["end"], freq=freq_str)

    # --- Validate the regression period
    robust = True
    if start_r >= end_r:
        print(f"Invalid regression period: {start_r=}, {end_r=}")
        robust = False
    no_nan_series = data.dropna()
    in_window = (no_nan_series.index >= start_r) & (no_nan_series.index <= end_r)
    number = len(no_nan_series[in_window])
    if number < MIN_REGRESSION_POINTS:
        print(f"Insufficient data points (n={number}) for regression.")
        robust = False

    return start_r, end_r, robust
Establish the regression period.
Args: data: Series - the original time series data. **kwargs: Additional keyword arguments.
Returns: A tuple containing the start and end periods for regression, and a boolean indicating if the period is robust.
def postcovid_plot(data: DataT, **kwargs: Unpack[PostcovidKwargs]) -> Axes:
    """Plot a series with a PeriodIndex, including a post-COVID projection.

    If the regression period is not usable (start not before end, or fewer
    than MIN_REGRESSION_POINTS observations in it) no exception is raised:
    a message is printed and the raw series is plotted without a projection.

    Args:
        data: Series - the series to be plotted.
        kwargs: PostcovidKwargs - plotting arguments. "plot_from" is ignored
            (with a warning); "start_r" and "end_r" set the regression period.

    Returns:
        Axes: the matplotlib Axes returned by line_plot().

    Raises:
        TypeError: if the cleaned data is not a pandas Series.
        ValueError: if the series does not have a D, M or Q frequency.
        Other errors may come from check_clean_timeseries() or line_plot().

    """
    # --- check the kwargs
    report_kwargs(caller=ME, **kwargs)
    validate_kwargs(schema=PostcovidKwargs, caller=ME, **kwargs)

    # --- check the data
    data = check_clean_timeseries(data, ME)
    if not isinstance(data, Series):
        raise TypeError("The series argument must be a pandas Series")

    # rely on line_plot() to validate kwargs, but remove any that are not relevant
    if "plot_from" in kwargs:
        print("Warning: the 'plot_from' argument is ignored in postcovid_plot().")
        del kwargs["plot_from"]

    # --- set the regression period
    start_r, end_r, robust = regression_period(data, **kwargs)
    kwargs.pop("start_r", None)  # not a line_plot() argument
    kwargs.pop("end_r", None)  # not a line_plot() argument
    if not robust:
        print("No valid regression period found; plotting raw data only.")
        return line_plot(
            data,
            **cast("LineKwargs", kwargs),
        )

    # --- combine data and projection
    if start_r < data.dropna().index.min():
        print(f"Caution: Regression start period pre-dates the series index: {start_r=}")
    recent_data = data[data.index >= start_r].copy()
    recent_data.name = "Series"
    # the regression period was validated on non-missing values, so fit on the same:
    # drop NaNs inside the regression window, keep later rows so the projection
    # still runs to the end of the data
    fit_data = recent_data[recent_data.notna() | (recent_data.index > end_r)]
    projection_data = get_projection(fit_data, end_r)
    projection_data.name = "Pre-COVID projection"

    # --- Create DataFrame with proper column alignment
    combined_data = DataFrame(
        {
            projection_data.name: projection_data,
            recent_data.name: recent_data,
        }
    )

    # --- activate plot settings (caller-supplied values take precedence)
    kwargs.setdefault(
        "width",
        (get_setting("line_normal"), get_setting("line_wide")),
    )  # series line is thicker than projection
    kwargs.setdefault("style", ("--", "-"))  # dashed regression line
    kwargs.setdefault("label_series", True)
    kwargs.setdefault("annotate", (False, True))  # annotate series only
    kwargs.setdefault("color", ("darkblue", "#dd0000"))
    kwargs.setdefault("dropna", False)  # keep NaN values so gaps in the series stay visible

    return line_plot(
        combined_data,
        **cast("LineKwargs", kwargs),
    )
Plot a series with a PeriodIndex, including a post-COVID projection.
Args: data: Series - the series to be plotted. kwargs: PostcovidKwargs - plotting arguments.
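Raises: TypeError if series is not a pandas Series TypeError if series does not have a PeriodIndex ValueError if series does not have a D, M or Q frequency. Note: an unusable regression period (start not before end, or too few data points) does not raise; a message is printed and the raw data is plotted without a projection.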