Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding: utf-8 -*- 

2""" 

3Various Statistical Tests 

4 

5Author: josef-pktd 

6License: BSD-3 

7 

8Notes 

9----- 

10Almost fully verified against R or Gretl, not all options are the same. 

11In many cases of Lagrange multiplier tests both the LM test and the F test is 

12returned. In some but not all cases, R has the option to choose the test 

13statistic. Some alternative test statistic results have not been verified. 

14 

15TODO 

16* refactor to store intermediate results 

17* how easy is it to attach a test that is a class to a result instance, 

18 for example CompareCox as a method compare_cox(self, other) ? 

19 

20missing: 

21 

22* pvalues for breaks_hansen 

23* additional options, compare with R, check where ddof is appropriate 

24* new tests: 

25 - breaks_ap, more recent breaks tests 

26 - specification tests against nonparametric alternatives 

27""" 

28from statsmodels.compat.pandas import deprecate_kwarg 

29from statsmodels.compat.python import iteritems 

30 

31from collections.abc import Iterable 

32 

33import numpy as np 

34import pandas as pd 

35from scipy import stats 

36 

37from statsmodels.regression.linear_model import OLS, RegressionResultsWrapper 

38from statsmodels.tsa.tsatools import lagmat 

39from statsmodels.tools.validation import (array_like, int_like, bool_like, 

40 string_like, dict_like, float_like) 

41from statsmodels.stats._lilliefors import (kstest_fit, lilliefors, 

42 kstest_normal, kstest_exponential) 

43from statsmodels.stats._adnorm import normal_ad, anderson_statistic 

44 

45__all__ = ["kstest_fit", "lilliefors", "kstest_normal", "kstest_exponential", 

46 "normal_ad", "compare_cox", "compare_j", "acorr_breusch_godfrey", 

47 "acorr_ljungbox", "acorr_lm", "het_arch", "het_breuschpagan", 

48 "het_goldfeldquandt", "het_white", "spec_white", "linear_lm", 

49 "linear_rainbow", "linear_harvey_collier", "anderson_statistic"] 

50 

51 

52NESTED_ERROR = """\ 

53The exog in results_x and in results_z are nested. {test} requires \ 

54that models are non-nested. 

55""" 

56 

57 

58# get the old signature back so the examples work 

59def unitroot_adf(x, maxlag=None, trendorder=0, autolag='AIC', store=False): 

60 import warnings 

61 warnings.warn("unitroot_adf is deprecated and will be removed after 0.11.", 

62 FutureWarning) 

63 from statsmodels.tsa.stattools import adfuller 

64 trendorder = {0: 'nc', 1: 'c', 2: 'ct', 3: 'ctt'}[trendorder] 

65 return adfuller(x, maxlag=maxlag, regression=trendorder, autolag=autolag, 

66 store=store, regresults=False) 

67 

68 

69def _check_nested_exog(small, large): 

70 """ 

71 Check if a larger exog nests a smaller exog 

72 

73 Parameters 

74 ---------- 

75 small : ndarray 

76 exog from smaller model 

77 large : ndarray 

78 exog from larger model 

79 

80 Returns 

81 ------- 

82 bool 

83 True if small is nested by large 

84 """ 

85 

86 if small.shape[1] > large.shape[1]: 

87 return False 

88 coef = np.linalg.lstsq(large, small, rcond=None)[0] 

89 err = small - large @ coef 

90 return np.linalg.matrix_rank(np.c_[large, err]) == large.shape[1] 

91 

92 

93def _check_nested_results(results_x, results_z): 

94 if not isinstance(results_x, RegressionResultsWrapper): 

95 raise TypeError("results_x must come from a linear regression model") 

96 if not isinstance(results_z, RegressionResultsWrapper): 

97 raise TypeError("results_z must come from a linear regression model") 

98 if not np.allclose(results_x.model.endog, results_z.model.endog): 

99 raise ValueError("endogenous variables in models are not the same") 

100 

101 x = results_x.model.exog 

102 z = results_z.model.exog 

103 

104 nested = False 

105 if x.shape[1] <= z.shape[1]: 

106 nested = nested or _check_nested_exog(x, z) 

107 else: 

108 nested = nested or _check_nested_exog(z, x) 

109 return nested 

110 

111 

112class ResultsStore(object): 

113 def __str__(self): 

114 return getattr(self, '_str', self.__class__.__name__) 

115 

116 

117def compare_cox(results_x, results_z, store=False): 

118 """ 

119 Compute the Cox test for non-nested models 

120 

121 Parameters 

122 ---------- 

123 results_x : Result instance 

124 result instance of first model 

125 results_z : Result instance 

126 result instance of second model 

127 store : bool, default False 

128 If true, then the intermediate results are returned. 

129 

130 Returns 

131 ------- 

132 tstat : float 

133 t statistic for the test that including the fitted values of the 

134 first model in the second model has no effect. 

135 pvalue : float 

136 two-sided pvalue for the t statistic 

137 res_store : ResultsStore, optional 

138 Intermediate results. Returned if store is True. 

139 

140 Notes 

141 ----- 

142 Tests of non-nested hypothesis might not provide unambiguous answers. 

143 The test should be performed in both directions and it is possible 

144 that both or neither test rejects. see [1]_ for more information. 

145 

146 Formulas from [1]_, section 8.3.4 translated to code 

147 

148 Matches results for Example 8.3 in Greene 

149 

150 References 

151 ---------- 

152 .. [1] Greene, W. H. Econometric Analysis. New Jersey. Prentice Hall; 

153 5th edition. (2002). 

154 """ 

155 if _check_nested_results(results_x, results_z): 

156 raise ValueError(NESTED_ERROR.format(test="Cox comparison")) 

157 x = results_x.model.exog 

158 z = results_z.model.exog 

159 nobs = results_x.model.endog.shape[0] 

160 sigma2_x = results_x.ssr / nobs 

161 sigma2_z = results_z.ssr / nobs 

162 yhat_x = results_x.fittedvalues 

163 res_dx = OLS(yhat_x, z).fit() 

164 err_zx = res_dx.resid 

165 res_xzx = OLS(err_zx, x).fit() 

166 err_xzx = res_xzx.resid 

167 

168 sigma2_zx = sigma2_x + np.dot(err_zx.T, err_zx) / nobs 

169 c01 = nobs / 2. * (np.log(sigma2_z) - np.log(sigma2_zx)) 

170 v01 = sigma2_x * np.dot(err_xzx.T, err_xzx) / sigma2_zx ** 2 

171 q = c01 / np.sqrt(v01) 

172 pval = 2 * stats.norm.sf(np.abs(q)) 

173 

174 if store: 

175 res = ResultsStore() 

176 res.res_dx = res_dx 

177 res.res_xzx = res_xzx 

178 res.c01 = c01 

179 res.v01 = v01 

180 res.q = q 

181 res.pvalue = pval 

182 res.dist = stats.norm 

183 return q, pval, res 

184 

185 return q, pval 

186 

187 

188class CompareCox(object): 

189 """ 

190 Cox Test for non-nested models 

191 

192 .. deprecated:: 0.11 

193 

194 CompareCox is deprecated in favor of compare_cox. 

195 CompareCox will be removed after 0.12. 

196 """ 

197 

198 def __init__(self): 

199 import warnings 

200 warnings.warn("CompareCox is deprecated in favor of compare_cox and " 

201 "will be removed after 0.12.", FutureWarning) 

202 self.res_dx = self.res_xzx = self.c01 = None 

203 self.v01 = self.q = self.pvalue = self.dist = None 

204 

205 def run(self, results_x, results_z, attach=False): 

206 results = compare_cox(results_x, results_z, store=attach) 

207 if attach: 

208 res = results[-1] 

209 self.res_dx = res.res_dx 

210 self.res_xzx = res.res_xzx 

211 self.c01 = res.c01 

212 self.v01 = res.v01 

213 self.q = res.q 

214 self.pvalue = res.pvalue 

215 self.dist = res.dist 

216 

217 return results 

218 

219 def __call__(self, results_x, results_z): 

220 return self.run(results_x, results_z, attach=False) 

221 

222 

223def compare_j(results_x, results_z, store=False): 

224 """ 

225 Compute the J-test for non-nested models 

226 

227 Parameters 

228 ---------- 

229 results_x : RegressionResults 

230 The result instance of first model. 

231 results_z : RegressionResults 

232 The result instance of second model. 

233 store : bool, default False 

234 If true, then the intermediate results are returned. 

235 

236 Returns 

237 ------- 

238 tstat : float 

239 t statistic for the test that including the fitted values of the 

240 first model in the second model has no effect. 

241 pvalue : float 

242 two-sided pvalue for the t statistic 

243 res_store : ResultsStore, optional 

244 Intermediate results. Returned if store is True. 

245 

246 Notes 

247 ----- 

248 From description in Greene, section 8.3.3. Matches results for Example 

249 8.3, Greene. 

250 

251 Tests of non-nested hypothesis might not provide unambiguous answers. 

252 The test should be performed in both directions and it is possible 

253 that both or neither test rejects. see Greene for more information. 

254 

255 References 

256 ---------- 

257 .. [1] Greene, W. H. Econometric Analysis. New Jersey. Prentice Hall; 

258 5th edition. (2002). 

259 """ 

260 # TODO: Allow cov to be specified 

261 if _check_nested_results(results_x, results_z): 

262 raise ValueError(NESTED_ERROR.format(test="J comparison")) 

263 y = results_x.model.endog 

264 z = results_z.model.exog 

265 yhat_x = results_x.fittedvalues 

266 res_zx = OLS(y, np.column_stack((yhat_x, z))).fit() 

267 tstat = res_zx.tvalues[0] 

268 pval = res_zx.pvalues[0] 

269 if store: 

270 res = ResultsStore() 

271 res.res_zx = res_zx 

272 res.dist = stats.t(res_zx.df_resid) 

273 res.teststat = tstat 

274 res.pvalue = pval 

275 return tstat, pval, res 

276 

277 return tstat, pval 

278 

279 

280class CompareJ(object): 

281 """ 

282 J-Test for comparing non-nested models 

283 

284 .. deprecated:: 0.11 

285 

286 CompareJ is deprecated in favor of compare_j. 

287 CompareJ will be removed after 0.12. 

288 """ 

289 

290 def __init__(self): 

291 import warnings 

292 warnings.warn("CompareJ is deprecated in favor of compare_j and will " 

293 "be removed after 0.12.", FutureWarning) 

294 self.res_zx = self.dist = self.teststat = self.pvalue = None 

295 

296 def run(self, results_x, results_z, attach=True): 

297 res = compare_j(results_x, results_z, store=attach) 

298 tstat, pval = res[:2] 

299 if attach: 

300 self.res_zx = res[-1].res_zx 

301 self.dist = res[-1].dist 

302 self.teststat = res[-1].teststat 

303 self.pvalue = res[-1].pvalue 

304 

305 return tstat, pval 

306 

307 def __call__(self, results_x, results_z): 

308 return self.run(results_x, results_z, attach=False) 

309 

310 

311def compare_encompassing(results_x, results_z, cov_type="nonrobust", 

312 cov_kwargs=None): 

313 r""" 

314 Davidson-MacKinnon encompassing test for comparing non-nested models 

315 

316 Parameters 

317 ---------- 

318 results_x : Result instance 

319 result instance of first model 

320 results_z : Result instance 

321 result instance of second model 

322 cov_type : str, default "nonrobust 

323 Covariance type. The default is "nonrobust` which uses the classic 

324 OLS covariance estimator. Specify one of "HC0", "HC1", "HC2", "HC3" 

325 to use White's covariance estimator. All covariance types supported 

326 by ``OLS.fit`` are accepted. 

327 cov_kwargs : dict, default None 

328 Dictionary of covariance options passed to ``OLS.fit``. See OLS.fit 

329 for more details. 

330 

331 Returns 

332 ------- 

333 DataFrame 

334 A DataFrame with two rows and four columns. The row labeled x 

335 contains results for the null that the model contained in 

336 results_x is equivalent to the encompassing model. The results in 

337 the row labeled z correspond to the test that the model contained 

338 in results_z are equivalent to the encompassing model. The columns 

339 are the test statistic, its p-value, and the numerator and 

340 denominator degrees of freedom. The test statistic has an F 

341 distribution. The numerator degree of freedom is the number of 

342 variables in the encompassing model that are not in the x or z model. 

343 The denominator degree of freedom is the number of observations minus 

344 the number of variables in the nesting model. 

345 

346 Notes 

347 ----- 

348 The null is that the fit produced using x is the same as the fit 

349 produced using both x and z. When testing whether x is encompassed, 

350 the model estimated is 

351 

352 .. math:: 

353 

354 Y = X\beta + Z_1\gamma + \epsilon 

355 

356 where :math:`Z_1` are the columns of :math:`Z` that are not spanned by 

357 :math:`X`. The null is :math:`H_0:\gamma=0`. When testing whether z is 

358 encompassed, the roles of :math:`X` and :math:`Z` are reversed. 

359 

360 Implementation of Davidson and MacKinnon (1993)'s encompassing test. 

361 Performs two Wald tests where models x and z are compared to a model 

362 that nests the two. The Wald tests are performed by using an OLS 

363 regression. 

364 """ 

365 if _check_nested_results(results_x, results_z): 

366 raise ValueError(NESTED_ERROR.format(test="Testing encompassing")) 

367 

368 y = results_x.model.endog 

369 x = results_x.model.exog 

370 z = results_z.model.exog 

371 

372 def _test_nested(endog, a, b, cov_est, cov_kwds): 

373 err = b - a @ np.linalg.lstsq(a, b, rcond=None)[0] 

374 u, s, v = np.linalg.svd(err) 

375 eps = np.finfo(np.double).eps 

376 tol = s.max(axis=-1, keepdims=True) * max(err.shape) * eps 

377 non_zero = np.abs(s) > tol 

378 aug = err @ v[:, non_zero] 

379 aug_reg = np.hstack([a, aug]) 

380 k_a = aug.shape[1] 

381 k = aug_reg.shape[1] 

382 

383 res = OLS(endog, aug_reg).fit(cov_type=cov_est, cov_kwds=cov_kwds) 

384 r_matrix = np.zeros((k_a, k)) 

385 r_matrix[:, -k_a:] = np.eye(k_a) 

386 test = res.wald_test(r_matrix, use_f=True) 

387 stat, pvalue = float(np.squeeze(test.statistic)), float(test.pvalue) 

388 df_num, df_denom = int(test.df_num), int(test.df_denom) 

389 return stat, pvalue, df_num, df_denom 

390 

391 x_nested = _test_nested(y, x, z, cov_type, cov_kwargs) 

392 z_nested = _test_nested(y, z, x, cov_type, cov_kwargs) 

393 return pd.DataFrame([x_nested, z_nested], 

394 index=["x", "z"], 

395 columns=["stat", "pvalue", "df_num", "df_denom"]) 

396 

397 

398def acorr_ljungbox(x, lags=None, boxpierce=False, model_df=0, period=None, 

399 return_df=None): 

400 """ 

401 Ljung-Box test of autocorrelation in residuals. 

402 

403 Parameters 

404 ---------- 

405 x : array_like 

406 The data series. The data is demeaned before the test statistic is 

407 computed. 

408 lags : {int, array_like}, default None 

409 If lags is an integer then this is taken to be the largest lag 

410 that is included, the test result is reported for all smaller lag 

411 length. If lags is a list or array, then all lags are included up to 

412 the largest lag in the list, however only the tests for the lags in 

413 the list are reported. If lags is None, then the default maxlag is 

414 currently min((nobs // 2 - 2), 40). After 0.12 this will change to 

415 min(10, nobs // 5). The default number of lags changes if period 

416 is set. 

417 boxpierce : bool, default False 

418 If true, then additional to the results of the Ljung-Box test also the 

419 Box-Pierce test results are returned. 

420 model_df : int, default 0 

421 Number of degrees of freedom consumed by the model. In an ARMA model, 

422 this value is usually p+q where p is the AR order and q is the MA 

423 order. This value is subtracted from the degrees-of-freedom used in 

424 the test so that the adjusted dof for the statistics are 

425 lags - model_df. If lags - model_df <= 0, then NaN is returned. 

426 period : int, default None 

427 The period of a Seasonal time series. Used to compute the max lag 

428 for seasonal data which uses min(2*period, nobs // 5) if set. If None, 

429 then the default rule is used to set the number of lags. When set, must 

430 be >= 2. 

431 return_df : bool, default None 

432 Flag indicating whether to return the result as a single DataFrame 

433 with columns lb_stat, lb_pvalue, and optionally bp_stat and bp_pvalue. 

434 After 0.12, this will become the only return method. Set to True 

435 to return the DataFrame or False to continue returning the 2 - 4 

436 output. If None (the default), a warning is raised. 

437 

438 Returns 

439 ------- 

440 lbvalue : float or array 

441 The Ljung-Box test statistic. 

442 pvalue : float or array 

443 The p-value based on chi-square distribution. The p-value is computed 

444 as 1.0 - chi2.cdf(lbvalue, dof) where dof is lag - model_df. If 

445 lag - model_df <= 0, then NaN is returned for the pvalue. 

446 bpvalue : (optional), float or array 

447 The test statistic for Box-Pierce test. 

448 bppvalue : (optional), float or array 

449 The p-value based for Box-Pierce test on chi-square distribution. 

450 The p-value is computed as 1.0 - chi2.cdf(bpvalue, dof) where dof is 

451 lag - model_df. If lag - model_df <= 0, then NaN is returned for the 

452 pvalue. 

453 

454 See Also 

455 -------- 

456 statsmodels.regression.linear_model.OLS.fit 

457 Regression model fitting. 

458 statsmodels.regression.linear_model.RegressionResults 

459 Results from linear regression models. 

460 

461 Notes 

462 ----- 

463 Ljung-Box and Box-Pierce statistic differ in their scaling of the 

464 autocorrelation function. Ljung-Box test is has better finite-sample 

465 properties. 

466 

467 References 

468 ---------- 

469 .. [*] Green, W. "Econometric Analysis," 5th ed., Pearson, 2003. 

470 

471 Examples 

472 -------- 

473 >>> import statsmodels.api as sm 

474 >>> data = sm.datasets.sunspots.load_pandas().data 

475 >>> res = sm.tsa.ARMA(data["SUNACTIVITY"], (1,1)).fit(disp=-1) 

476 >>> sm.stats.acorr_ljungbox(res.resid, lags=[10], return_df=True) 

477 lb_stat lb_pvalue 

478 10 214.106992 1.827374e-40 

479 """ 

480 x = array_like(x, "x") 

481 period = int_like(period, "period", optional=True) 

482 return_df = bool_like(return_df, "return_df", optional=True) 

483 model_df = int_like(model_df, "model_df", optional=False) 

484 if period is not None and period <= 1: 

485 raise ValueError("period must be >= 2") 

486 if model_df < 0: 

487 raise ValueError("model_df must be >= 0") 

488 nobs = x.shape[0] 

489 if period is not None: 

490 lags = np.arange(1, min(nobs // 5, 2 * period) + 1, dtype=np.int) 

491 elif lags is None: 

492 # TODO: Switch to min(10, nobs//5) after 0.12 

493 import warnings 

494 warnings.warn("The default value of lags is changing. After 0.12, " 

495 "this value will become min(10, nobs//5). Directly set" 

496 "lags to silence this warning.", FutureWarning) 

497 # Future 

498 # lags = np.arange(1, min(nobs // 5, 10) + 1, dtype=np.int) 

499 lags = np.arange(1, min((nobs // 2 - 2), 40) + 1, dtype=np.int) 

500 elif not isinstance(lags, Iterable): 

501 lags = int_like(lags, "lags") 

502 lags = np.arange(1, lags + 1) 

503 lags = array_like(lags, "lags", dtype="int") 

504 maxlag = lags.max() 

505 

506 # Avoid cyclic import 

507 from statsmodels.tsa.stattools import acf 

508 # normalize by nobs not (nobs-nlags) 

509 # SS: unbiased=False is default now 

510 sacf = acf(x, nlags=maxlag, fft=False) 

511 sacf2 = sacf[1:maxlag + 1] ** 2 / (nobs - np.arange(1, maxlag + 1)) 

512 qljungbox = nobs * (nobs + 2) * np.cumsum(sacf2)[lags - 1] 

513 adj_lags = lags - model_df 

514 pval = np.full_like(qljungbox, np.nan) 

515 loc = adj_lags > 0 

516 pval[loc] = stats.chi2.sf(qljungbox[loc], adj_lags[loc]) 

517 

518 if return_df is None: 

519 msg = ("The value returned will change to a single DataFrame after " 

520 "0.12 is released. Set return_df to True to use to return a " 

521 "DataFrame now. Set return_df to False to silence this " 

522 "warning.") 

523 import warnings 

524 warnings.warn(msg, FutureWarning) 

525 

526 if not boxpierce: 

527 if return_df: 

528 return pd.DataFrame({"lb_stat": qljungbox, "lb_pvalue": pval}, 

529 index=lags) 

530 return qljungbox, pval 

531 

532 qboxpierce = nobs * np.cumsum(sacf[1:maxlag + 1] ** 2)[lags - 1] 

533 pvalbp = np.full_like(qljungbox, np.nan) 

534 pvalbp[loc] = stats.chi2.sf(qboxpierce[loc], adj_lags[loc]) 

535 if return_df: 

536 return pd.DataFrame({"lb_stat": qljungbox, "lb_pvalue": pval, 

537 "bp_stat": qboxpierce, "bp_pvalue": pvalbp}, 

538 index=lags) 

539 

540 return qljungbox, pval, qboxpierce, pvalbp 

541 

542 

543@deprecate_kwarg("maxlag", "nlags") 

544def acorr_lm(resid, nlags=None, autolag="AIC", store=False, *, period=None, 

545 ddof=0, cov_type="nonrobust", cov_kwargs=None): 

546 """ 

547 Lagrange Multiplier tests for autocorrelation. 

548 

549 This is a generic Lagrange Multiplier test for autocorrelation. Returns 

550 Engle's ARCH test if resid is the squared residual array. Breusch-Godfrey 

551 is a variation on this test with additional exogenous variables. 

552 

553 Parameters 

554 ---------- 

555 resid : array_like 

556 Time series to test. 

557 nlags : int, default None 

558 Highest lag to use. The behavior of this parameter will change 

559 after 0.12. 

560 autolag : {str, None}, default "AIC" 

561 If None, then a fixed number of lags given by maxlag is used. This 

562 parameter is deprecated and will be removed after 0.12. Searching 

563 for model specification cannot control test size. 

564 store : bool, default False 

565 If true then the intermediate results are also returned. 

566 period : int, default none 

567 The period of a Seasonal time series. Used to compute the max lag 

568 for seasonal data which uses min(2*period, nobs // 5) if set. If None, 

569 then the default rule is used to set the number of lags. When set, must 

570 be >= 2. 

571 ddof : int, default 0 

572 The number of degrees of freedom consumed by the model used to 

573 produce resid. The default value is 0. 

574 cov_type : str, default "nonrobust" 

575 Covariance type. The default is "nonrobust` which uses the classic 

576 OLS covariance estimator. Specify one of "HC0", "HC1", "HC2", "HC3" 

577 to use White's covariance estimator. All covariance types supported 

578 by ``OLS.fit`` are accepted. 

579 cov_kwargs : dict, default None 

580 Dictionary of covariance options passed to ``OLS.fit``. See OLS.fit for 

581 more details. 

582 

583 Returns 

584 ------- 

585 lm : float 

586 Lagrange multiplier test statistic. 

587 lmpval : float 

588 The p-value for Lagrange multiplier test. 

589 fval : float 

590 The f statistic of the F test, alternative version of the same 

591 test based on F test for the parameter restriction. 

592 fpval : float 

593 The pvalue of the F test. 

594 res_store : ResultsStore, optional 

595 Intermediate results. Only returned if store=True. 

596 

597 See Also 

598 -------- 

599 het_arch 

600 Conditional heteroskedasticity testing. 

601 acorr_breusch_godfrey 

602 Breusch-Godfrey test for serial correlation. 

603 acorr_ljung_box 

604 Ljung-Box test for serial correlation. 

605 

606 Notes 

607 ----- 

608 The test statistic is computed as (nobs - ddof) * r2 where r2 is the 

609 R-squared from a regression on the residual on nlags lags of the 

610 residual. 

611 """ 

612 resid = array_like(resid, "resid", ndim=1) 

613 cov_type = string_like(cov_type, "cov_type") 

614 cov_kwargs = {} if cov_kwargs is None else cov_kwargs 

615 cov_kwargs = dict_like(cov_kwargs, "cov_kwargs") 

616 nobs = resid.shape[0] 

617 if period is not None and nlags is None: 

618 maxlag = min(nobs // 5, 2 * period) 

619 elif nlags is None: 

620 # TODO: Switch to min(10, nobs//5) after 0.12 

621 import warnings 

622 warnings.warn("The default value of nlags is changing. After 0.12, " 

623 "this value will become min(10, nobs//5). Directly set" 

624 "maxlags or period to silence this warning.", 

625 FutureWarning) 

626 # Future 

627 # maxlag = min(nobs // 5, 10) 

628 # Old: for adf from Greene referencing Schwert 1989 

629 maxlag = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.))) 

630 else: 

631 maxlag = nlags 

632 

633 xdall = lagmat(resid[:, None], maxlag, trim="both") 

634 nobs = xdall.shape[0] 

635 xdall = np.c_[np.ones((nobs, 1)), xdall] 

636 xshort = resid[-nobs:] 

637 res_store = ResultsStore() 

638 

639 if autolag: 

640 # TODO: Deprecate this 

641 # Use same rules as autocorr 

642 # search for lag length with highest information criteria 

643 # Note: I use the same number of observations to have comparable IC 

644 import warnings 

645 warnings.warn("autolag is deprecated and will be removed after 0.12. " 

646 "Model selection before testing fails to control test " 

647 "size. Set autolag to False to silence this warning.", 

648 FutureWarning) 

649 results = {} 

650 for mlag in range(1, maxlag + 1): 

651 results[mlag] = OLS(xshort, xdall[:, :mlag + 1]).fit() 

652 

653 if autolag.lower() == "aic": 

654 bestic, icbestlag = min((v.aic, k) for k, v in iteritems(results)) 

655 elif autolag.lower() == "bic": 

656 icbest, icbestlag = min((v.bic, k) for k, v in iteritems(results)) 

657 else: 

658 raise ValueError("autolag can only be None, \"AIC\" or \"BIC\"") 

659 

660 # rerun ols with best ic 

661 xdall = lagmat(resid[:, None], icbestlag, trim="both") 

662 nobs = xdall.shape[0] 

663 xdall = np.c_[np.ones((nobs, 1)), xdall] 

664 xshort = resid[-nobs:] 

665 usedlag = icbestlag 

666 if store: 

667 res_store.results = results 

668 else: 

669 usedlag = maxlag 

670 

671 resols = OLS(xshort, xdall[:, :usedlag + 1]).fit(cov_type=cov_type, 

672 cov_kwargs=cov_kwargs) 

673 fval = resols.fvalue 

674 fpval = resols.f_pvalue 

675 if cov_type == "nonrobust": 

676 lm = (nobs - ddof) * resols.rsquared 

677 lmpval = stats.chi2.sf(lm, usedlag) 

678 # Note: deg of freedom for LM test: nvars - constant = lags used 

679 else: 

680 r_matrix = np.hstack((np.ones((usedlag, 1)), np.eye(usedlag))) 

681 test_stat = resols.wald_test(r_matrix, use_f=False) 

682 lm = float(test_stat.statistic) 

683 lmpval = test_stat.pvalue 

684 

685 if store: 

686 res_store.resols = resols 

687 res_store.usedlag = usedlag 

688 return lm, lmpval, fval, fpval, res_store 

689 else: 

690 return lm, lmpval, fval, fpval 

691 

692 

693@deprecate_kwarg("maxlag", "nlags") 

694def het_arch(resid, nlags=None, autolag=None, store=False, ddof=0): 

695 """ 

696 Engle's Test for Autoregressive Conditional Heteroscedasticity (ARCH). 

697 

698 Parameters 

699 ---------- 

700 resid : ndarray 

701 residuals from an estimation, or time series 

702 nlags : int, default None 

703 Highest lag to use. The behavior of this parameter will change 

704 after 0.12. 

705 autolag : {str, None}, default None 

706 If None, then a fixed number of lags given by maxlag is used. This 

707 parameter is deprecated and will be removed after 0.12. Searching 

708 for model specification cannot control test size. 

709 store : bool, default False 

710 If true then the intermediate results are also returned 

711 ddof : int, default 0 

712 If the residuals are from a regression, or ARMA estimation, then there 

713 are recommendations to correct the degrees of freedom by the number 

714 of parameters that have been estimated, for example ddof=p+q for an 

715 ARMA(p,q). 

716 

717 Returns 

718 ------- 

719 lm : float 

720 Lagrange multiplier test statistic 

721 lmpval : float 

722 p-value for Lagrange multiplier test 

723 fval : float 

724 fstatistic for F test, alternative version of the same test based on 

725 F test for the parameter restriction 

726 fpval : float 

727 pvalue for F test 

728 res_store : ResultsStore, optional 

729 Intermediate results. Returned if store is True. 

730 

731 Notes 

732 ----- 

733 verified against R:FinTS::ArchTest 

734 """ 

735 return acorr_lm(resid ** 2, nlags=nlags, autolag=autolag, store=store, 

736 ddof=ddof) 

737 

738 

739@deprecate_kwarg("results", "res") 

740def acorr_breusch_godfrey(res, nlags=None, store=False): 

741 """ 

742 Breusch-Godfrey Lagrange Multiplier tests for residual autocorrelation. 

743 

744 Parameters 

745 ---------- 

746 res : RegressionResults 

747 Estimation results for which the residuals are tested for serial 

748 correlation. 

749 nlags : int, default None 

750 Number of lags to include in the auxiliary regression. (nlags is 

751 highest lag). 

752 store : bool, default False 

753 If store is true, then an additional class instance that contains 

754 intermediate results is returned. 

755 

756 Returns 

757 ------- 

758 lm : float 

759 Lagrange multiplier test statistic. 

760 lmpval : float 

761 The p-value for Lagrange multiplier test. 

762 fval : float 

763 The value of the f statistic for F test, alternative version of the 

764 same test based on F test for the parameter restriction. 

765 fpval : float 

766 The pvalue for F test. 

767 res_store : ResultsStore 

768 A class instance that holds intermediate results. Only returned if 

769 store=True. 

770 

771 Notes 

772 ----- 

773 BG adds lags of residual to exog in the design matrix for the auxiliary 

774 regression with residuals as endog. See [1]_, section 12.7.1. 

775 

776 References 

777 ---------- 

778 .. [1] Greene, W. H. Econometric Analysis. New Jersey. Prentice Hall; 

779 5th edition. (2002). 

780 """ 

781 

782 x = np.asarray(res.resid).squeeze() 

783 if x.ndim != 1: 

784 raise ValueError("Model resid must be a 1d array. Cannot be used on" 

785 " multivariate models.") 

786 exog_old = res.model.exog 

787 nobs = x.shape[0] 

788 if nlags is None: 

789 # TODO: Switch to min(10, nobs//5) after 0.12 

790 import warnings 

791 warnings.warn("The default value of nlags is changing. After 0.12, " 

792 "this value will become min(10, nobs//5). Directly set" 

793 "nlags or period to silence this warning.", 

794 FutureWarning) 

795 

796 nlags = np.trunc(12. * np.power(nobs / 100., 1 / 4.)) 

797 nlags = int(nlags) 

798 

799 x = np.concatenate((np.zeros(nlags), x)) 

800 

801 xdall = lagmat(x[:, None], nlags, trim="both") 

802 nobs = xdall.shape[0] 

803 xdall = np.c_[np.ones((nobs, 1)), xdall] 

804 xshort = x[-nobs:] 

805 exog = np.column_stack((exog_old, xdall)) 

806 k_vars = exog.shape[1] 

807 

808 resols = OLS(xshort, exog).fit() 

809 ft = resols.f_test(np.eye(nlags, k_vars, k_vars - nlags)) 

810 fval = ft.fvalue 

811 fpval = ft.pvalue 

812 fval = float(np.squeeze(fval)) 

813 fpval = float(np.squeeze(fpval)) 

814 lm = nobs * resols.rsquared 

815 lmpval = stats.chi2.sf(lm, nlags) 

816 # Note: degrees of freedom for LM test is nvars minus constant = usedlags 

817 

818 if store: 

819 res_store = ResultsStore() 

820 res_store.resols = resols 

821 res_store.usedlag = nlags 

822 return lm, lmpval, fval, fpval, res_store 

823 else: 

824 return lm, lmpval, fval, fpval 

825 

826 

827def het_breuschpagan(resid, exog_het): 

828 r""" 

829 Breusch-Pagan Lagrange Multiplier test for heteroscedasticity 

830 

831 The tests the hypothesis that the residual variance does not depend on 

832 the variables in x in the form 

833 

834 .. :math: \sigma_i = \sigma * f(\alpha_0 + \alpha z_i) 

835 

836 Homoscedasticity implies that :math:`\alpha=0`. 

837 

838 Parameters 

839 ---------- 

840 resid : array_like 

841 For the Breusch-Pagan test, this should be the residual of a 

842 regression. If an array is given in exog, then the residuals are 

843 calculated by the an OLS regression or resid on exog. In this case 

844 resid should contain the dependent variable. Exog can be the same as x. 

845 exog_het : array_like 

846 This contains variables suspected of being related to 

847 heteroscedasticity in resid. 

848 

849 Returns 

850 ------- 

851 lm : float 

852 lagrange multiplier statistic 

853 lm_pvalue :float 

854 p-value of lagrange multiplier test 

855 fvalue : float 

856 f-statistic of the hypothesis that the error variance does not depend 

857 on x 

858 f_pvalue : float 

859 p-value for the f-statistic 

860 

861 Notes 

862 ----- 

863 Assumes x contains constant (for counting dof and calculation of R^2). 

864 In the general description of LM test, Greene mentions that this test 

865 exaggerates the significance of results in small or moderately large 

866 samples. In this case the F-statistic is preferable. 

867 

868 **Verification** 

869 

870 Chisquare test statistic is exactly (<1e-13) the same result as bptest 

871 in R-stats with defaults (studentize=True). 

872 

873 **Implementation** 

874 

875 This is calculated using the generic formula for LM test using $R^2$ 

876 (Greene, section 17.6) and not with the explicit formula 

877 (Greene, section 11.4.3). 

878 The degrees of freedom for the p-value assume x is full rank. 

879 

880 References 

881 ---------- 

882 .. [1] Greene, W. H. Econometric Analysis. New Jersey. Prentice Hall; 

883 5th edition. (2002). 

884 .. [2] Breusch, T. S.; Pagan, A. R. (1979). "A Simple Test for 

885 Heteroskedasticity and Random Coefficient Variation". Econometrica. 

886 47 (5): 1287–1294. 

887 """ 

888 

889 x = np.asarray(exog_het) 

890 y = np.asarray(resid) ** 2 

891 nobs, nvars = x.shape 

892 resols = OLS(y, x).fit() 

893 fval = resols.fvalue 

894 fpval = resols.f_pvalue 

895 lm = nobs * resols.rsquared 

896 # Note: degrees of freedom for LM test is nvars minus constant 

897 return lm, stats.chi2.sf(lm, nvars - 1), fval, fpval 

898 

899 

900def het_white(resid, exog): 

901 """ 

902 White's Lagrange Multiplier Test for Heteroscedasticity. 

903 

904 Parameters 

905 ---------- 

906 resid : array_like 

907 The residuals. The squared residuals are used as the endogenous 

908 variable. 

909 exog : array_like 

910 The explanatory variables for the variance. Squares and interaction 

911 terms are automatically included in the auxiliary regression. 

912 

913 Returns 

914 ------- 

915 lm : float 

916 The lagrange multiplier statistic. 

917 lm_pvalue :float 

918 The p-value of lagrange multiplier test. 

919 fvalue : float 

920 The f-statistic of the hypothesis that the error variance does not 

921 depend on x. This is an alternative test variant not the original 

922 LM test. 

923 f_pvalue : float 

924 The p-value for the f-statistic. 

925 

926 Notes 

927 ----- 

928 Assumes x contains constant (for counting dof). 

929 

930 question: does f-statistic make sense? constant ? 

931 

932 References 

933 ---------- 

934 Greene section 11.4.1 5th edition p. 222. Test statistic reproduces 

935 Greene 5th, example 11.3. 

936 """ 

937 x = array_like(exog, "exog", ndim=2) 

938 y = array_like(resid, "resid", ndim=2, shape=(x.shape[0], 1)) 

939 if x.shape[1] < 2: 

940 raise ValueError("White's heteroskedasticity test requires exog to" 

941 "have at least two columns where one is a constant.") 

942 nobs, nvars0 = x.shape 

943 i0, i1 = np.triu_indices(nvars0) 

944 exog = x[:, i0] * x[:, i1] 

945 nobs, nvars = exog.shape 

946 assert nvars == nvars0 * (nvars0 - 1) / 2. + nvars0 

947 resols = OLS(y ** 2, exog).fit() 

948 fval = resols.fvalue 

949 fpval = resols.f_pvalue 

950 lm = nobs * resols.rsquared 

951 # Note: degrees of freedom for LM test is nvars minus constant 

952 # degrees of freedom take possible reduced rank in exog into account 

953 # df_model checks the rank to determine df 

954 # extra calculation that can be removed: 

955 assert resols.df_model == np.linalg.matrix_rank(exog) - 1 

956 lmpval = stats.chi2.sf(lm, resols.df_model) 

957 return lm, lmpval, fval, fpval 

958 

959 

960def het_goldfeldquandt(y, x, idx=None, split=None, drop=None, 

961 alternative="increasing", store=False): 

962 """ 

963 Goldfeld-Quandt homoskedasticity test. 

964 

965 This test examines whether the residual variance is the same in 2 

966 subsamples. 

967 

968 Parameters 

969 ---------- 

970 y : array_like 

971 endogenous variable 

972 x : array_like 

973 exogenous variable, regressors 

974 idx : int, default None 

975 column index of variable according to which observations are 

976 sorted for the split 

977 split : {int, float}, default None 

978 If an integer, this is the index at which sample is split. 

979 If a float in 0<split<1 then split is interpreted as fraction 

980 of the observations in the first sample. If None, uses nobs//2. 

981 drop : {int, float}, default None 

982 If this is not None, then observation are dropped from the middle 

983 part of the sorted series. If 0<split<1 then split is interpreted 

984 as fraction of the number of observations to be dropped. 

985 Note: Currently, observations are dropped between split and 

986 split+drop, where split and drop are the indices (given by rounding 

987 if specified as fraction). The first sample is [0:split], the 

988 second sample is [split+drop:] 

989 alternative : {"increasing", "decreasing", "two-sided"} 

990 The default is increasing. This specifies the alternative for the 

991 p-value calculation. 

992 store : bool, default False 

993 Flag indicating to return the regression results 

994 

995 Returns 

996 ------- 

997 fval : float 

998 value of the F-statistic 

999 pval : float 

1000 p-value of the hypothesis that the variance in one subsample is 

1001 larger than in the other subsample 

1002 ordering : str 

1003 The ordering used in the alternative. 

1004 res_store : ResultsStore, optional 

1005 Storage for the intermediate and final results that are calculated 

1006 

1007 Notes 

1008 ----- 

1009 The Null hypothesis is that the variance in the two sub-samples are the 

1010 same. The alternative hypothesis, can be increasing, i.e. the variance 

1011 in the second sample is larger than in the first, or decreasing or 

1012 two-sided. 

1013 

1014 Results are identical R, but the drop option is defined differently. 

1015 (sorting by idx not tested yet) 

1016 """ 

1017 x = np.asarray(x) 

1018 y = np.asarray(y) # **2 

1019 nobs, nvars = x.shape 

1020 if split is None: 

1021 split = nobs // 2 

1022 elif (0 < split) and (split < 1): 

1023 split = int(nobs * split) 

1024 

1025 if drop is None: 

1026 start2 = split 

1027 elif (0 < drop) and (drop < 1): 

1028 start2 = split + int(nobs * drop) 

1029 else: 

1030 start2 = split + drop 

1031 

1032 if idx is not None: 

1033 xsortind = np.argsort(x[:, idx]) 

1034 y = y[xsortind] 

1035 x = x[xsortind, :] 

1036 

1037 resols1 = OLS(y[:split], x[:split]).fit() 

1038 resols2 = OLS(y[start2:], x[start2:]).fit() 

1039 fval = resols2.mse_resid / resols1.mse_resid 

1040 # if fval>1: 

1041 if alternative.lower() in ["i", "inc", "increasing"]: 

1042 fpval = stats.f.sf(fval, resols1.df_resid, resols2.df_resid) 

1043 ordering = "increasing" 

1044 elif alternative.lower() in ["d", "dec", "decreasing"]: 

1045 fpval = stats.f.sf(1. / fval, resols2.df_resid, resols1.df_resid) 

1046 ordering = "decreasing" 

1047 elif alternative.lower() in ["2", "2-sided", "two-sided"]: 

1048 fpval_sm = stats.f.cdf(fval, resols2.df_resid, resols1.df_resid) 

1049 fpval_la = stats.f.sf(fval, resols2.df_resid, resols1.df_resid) 

1050 fpval = 2 * min(fpval_sm, fpval_la) 

1051 ordering = "two-sided" 

1052 else: 

1053 raise ValueError("invalid alternative") 

1054 

1055 if store: 

1056 res = ResultsStore() 

1057 res.__doc__ = "Test Results for Goldfeld-Quandt test of" \ 

1058 "heterogeneity" 

1059 res.fval = fval 

1060 res.fpval = fpval 

1061 res.df_fval = (resols2.df_resid, resols1.df_resid) 

1062 res.resols1 = resols1 

1063 res.resols2 = resols2 

1064 res.ordering = ordering 

1065 res.split = split 

1066 res._str = """\ 

1067The Goldfeld-Quandt test for null hypothesis that the variance in the second 

1068subsample is %s than in the first subsample: 

1069F-statistic =%8.4f and p-value =%8.4f""" % (ordering, fval, fpval) 

1070 

1071 return fval, fpval, ordering, res 

1072 

1073 return fval, fpval, ordering 

1074 

1075 

1076class HetGoldfeldQuandt(object): 

1077 """ 

1078 Test whether variance is the same in 2 subsamples 

1079 

1080 .. deprecated:: 0.11 

1081 

1082 HetGoldfeldQuandt is deprecated in favor of het_goldfeldquandt. 

1083 HetGoldfeldQuandt will be removed after 0.12. 

1084 

1085 See Also 

1086 -------- 

1087 het_goldfeldquant 

1088 Goldfeld-Quant heteroskedasticity test. 

1089 """ 

1090 

1091 def __init__(self): 

1092 import warnings 

1093 warnings.warn("HetGoldfeldQuandt is deprecated in favor of" 

1094 "het_goldfeldquandt. HetGoldfeldQuandt will be removed" 

1095 "after 0.12.", 

1096 FutureWarning) 

1097 self.fval = self.fpval = self.df_fval = self.resols1 = None 

1098 self.resols2 = self.ordering = self.split = self._str = None 

1099 

1100 def run(self, y, x, idx=None, split=None, drop=None, 

1101 alternative="increasing", attach=True): 

1102 """ 

1103 .. deprecated:: 0.11 

1104 

1105 Use het_goldfeldquant instead. 

1106 

1107 See Also 

1108 -------- 

1109 het_goldfeldquant 

1110 Goldfeld-Quant heteroskedasticity test. 

1111 """ 

1112 res = het_goldfeldquandt(y, x, idx=idx, split=split, drop=drop, 

1113 alternative=alternative, store=attach) 

1114 if attach: 

1115 store = res[-1] 

1116 self.__doc__ = store.__doc__ 

1117 self.fval = store.fval 

1118 self.fpval = store.fpval 

1119 self.df_fval = store.df_fval 

1120 self.resols1 = store.resols1 

1121 self.resols2 = store.resols2 

1122 self.ordering = store.ordering 

1123 self.split = store.split 

1124 

1125 return res[:3] 

1126 

1127 def __call__(self, y, x, idx=None, split=None, drop=None, 

1128 alternative="increasing"): 

1129 return self.run(y, x, idx=idx, split=split, drop=drop, 

1130 attach=False, alternative=alternative) 

1131 

1132 

1133@deprecate_kwarg("result", "res") 

1134def linear_reset(res, power=3, test_type="fitted", use_f=False, 

1135 cov_type="nonrobust", cov_kwargs=None): 

1136 r""" 

1137 Ramsey's RESET test for neglected nonlinearity 

1138 

1139 Parameters 

1140 ---------- 

1141 res : RegressionResults 

1142 A results instance from a linear regression. 

1143 power : {int, List[int]}, default 3 

1144 The maximum power to include in the model, if an integer. Includes 

1145 powers 2, 3, ..., power. If an list of integers, includes all powers 

1146 in the list. 

1147 test_type : str, default "fitted" 

1148 The type of augmentation to use: 

1149 

1150 * "fitted" : (default) Augment regressors with powers of fitted values. 

1151 * "exog" : Augment exog with powers of exog. Excludes binary 

1152 regressors. 

1153 * "princomp": Augment exog with powers of first principal component of 

1154 exog. 

1155 use_f : bool, default False 

1156 Flag indicating whether an F-test should be used (True) or a 

1157 chi-square test (False). 

1158 cov_type : str, default "nonrobust 

1159 Covariance type. The default is "nonrobust` which uses the classic 

1160 OLS covariance estimator. Specify one of "HC0", "HC1", "HC2", "HC3" 

1161 to use White's covariance estimator. All covariance types supported 

1162 by ``OLS.fit`` are accepted. 

1163 cov_kwargs : dict, default None 

1164 Dictionary of covariance options passed to ``OLS.fit``. See OLS.fit 

1165 for more details. 

1166 

1167 Returns 

1168 ------- 

1169 ContrastResults 

1170 Test results for Ramsey's Reset test. See notes for implementation 

1171 details. 

1172 

1173 Notes 

1174 ----- 

1175 The RESET test uses an augmented regression of the form 

1176 

1177 .. math:: 

1178 

1179 Y = X\beta + Z\gamma + \epsilon 

1180 

1181 where :math:`Z` are a set of regressors that are one of: 

1182 

1183 * Powers of :math:`X\hat{\beta}` from the original regression. 

1184 * Powers of :math:`X`, excluding the constant and binary regressors. 

1185 * Powers of the first principal component of :math:`X`. If the 

1186 model includes a constant, this column is dropped before computing 

1187 the principal component. In either case, the principal component 

1188 is extracted from the correlation matrix of remaining columns. 

1189 

1190 The test is a Wald test of the null :math:`H_0:\gamma=0`. If use_f 

1191 is True, then the quadratic-form test statistic is divided by the 

1192 number of restrictions and the F distribution is used to compute 

1193 the critical value. 

1194 """ 

1195 if not isinstance(res, RegressionResultsWrapper): 

1196 raise TypeError("result must come from a linear regression model") 

1197 if bool(res.model.k_constant) and res.model.exog.shape[1] == 1: 

1198 raise ValueError("exog contains only a constant column. The RESET " 

1199 "test requires exog to have at least 1 " 

1200 "non-constant column.") 

1201 test_type = string_like(test_type, "test_type", 

1202 options=("fitted", "exog", "princomp")) 

1203 cov_kwargs = dict_like(cov_kwargs, "cov_kwargs", optional=True) 

1204 use_f = bool_like(use_f, "use_f") 

1205 if isinstance(power, int): 

1206 if power < 2: 

1207 raise ValueError("power must be >= 2") 

1208 power = np.arange(2, power + 1, dtype=np.int) 

1209 else: 

1210 try: 

1211 power = np.array(power, dtype=np.int) 

1212 except Exception: 

1213 raise ValueError("power must be an integer or list of integers") 

1214 if power.ndim != 1 or len(set(power)) != power.shape[0] or \ 

1215 (power < 2).any(): 

1216 raise ValueError("power must contains distinct integers all >= 2") 

1217 exog = res.model.exog 

1218 if test_type == "fitted": 

1219 aug = res.fittedvalues[:, None] 

1220 elif test_type == "exog": 

1221 # Remove constant and binary 

1222 aug = res.model.exog 

1223 binary = ((exog == exog.max(axis=0)) | (exog == exog.min(axis=0))) 

1224 binary = binary.all(axis=0) 

1225 if binary.all(): 

1226 raise ValueError("Model contains only constant or binary data") 

1227 aug = aug[:, ~binary] 

1228 else: 

1229 from statsmodels.multivariate.pca import PCA 

1230 aug = exog 

1231 if res.k_constant: 

1232 retain = np.arange(aug.shape[1]).tolist() 

1233 retain.pop(int(res.model.data.const_idx)) 

1234 aug = aug[:, retain] 

1235 pca = PCA(aug, ncomp=1, standardize=bool(res.k_constant), 

1236 demean=bool(res.k_constant), method="nipals") 

1237 aug = pca.factors[:, :1] 

1238 aug_exog = np.hstack([exog] + [aug ** p for p in power]) 

1239 mod_class = res.model.__class__ 

1240 mod = mod_class(res.model.data.endog, aug_exog) 

1241 cov_kwargs = {} if cov_kwargs is None else cov_kwargs 

1242 res = mod.fit(cov_type=cov_type, cov_kwargs=cov_kwargs) 

1243 nrestr = aug_exog.shape[1] - exog.shape[1] 

1244 nparams = aug_exog.shape[1] 

1245 r_mat = np.eye(nrestr, nparams, k=nparams-nrestr) 

1246 return res.wald_test(r_mat, use_f=use_f) 

1247 

1248 

1249def linear_harvey_collier(res, order_by=None): 

1250 """ 

1251 Harvey Collier test for linearity 

1252 

1253 The Null hypothesis is that the regression is correctly modeled as linear. 

1254 

1255 Parameters 

1256 ---------- 

1257 res : RegressionResults 

1258 A results instance from a linear regression. 

1259 order_by : array_like, default None 

1260 Integer array specifying the order of the residuals. If not provided, 

1261 the order of the residuals is not changed. If provided, must have 

1262 the same number of observations as the endogenous variable. 

1263 

1264 Returns 

1265 ------- 

1266 tvalue : float 

1267 The test statistic, based on ttest_1sample. 

1268 pvalue : float 

1269 The pvalue of the test. 

1270 

1271 Notes 

1272 ----- 

1273 This test is a t-test that the mean of the recursive ols residuals is zero. 

1274 Calculating the recursive residuals might take some time for large samples. 

1275 """ 

1276 # I think this has different ddof than 

1277 # B.H. Baltagi, Econometrics, 2011, chapter 8 

1278 # but it matches Gretl and R:lmtest, pvalue at decimal=13 

1279 rr = recursive_olsresiduals(res, skip=3, alpha=0.95, order_by=order_by) 

1280 

1281 return stats.ttest_1samp(rr[3][3:], 0) 

1282 

1283 

1284def linear_rainbow(res, frac=0.5, order_by=None, use_distance=False, 

1285 center=None): 

1286 """ 

1287 Rainbow test for linearity 

1288 

1289 The null hypothesis is the fit of the model using full sample is the same 

1290 as using a central subset. The alternative is that the fits are difference. 

1291 The rainbow test has power against many different forms of nonlinearity. 

1292 

1293 Parameters 

1294 ---------- 

1295 res : RegressionResults 

1296 A results instance from a linear regression. 

1297 frac : float, default 0.5 

1298 The fraction of the data to include in the center model. 

1299 order_by : {ndarray, str, List[str]}, default None 

1300 If an ndarray, the values in the array are used to sort the 

1301 observations. If a string or a list of strings, these are interpreted 

1302 as column name(s) which are then used to lexicographically sort the 

1303 data. 

1304 use_distance : bool, default False 

1305 Flag indicating whether data should be ordered by the Mahalanobis 

1306 distance to the center. 

1307 center : {float, int}, default None 

1308 If a float, the value must be in [0, 1] and the center is center * 

1309 nobs of the ordered data. If an integer, must be in [0, nobs) and 

1310 is interpreted as the observation of the ordered data to use. 

1311 

1312 Returns 

1313 ------- 

1314 fstat : float 

1315 The test statistic based on the F test. 

1316 pvalue : float 

1317 The pvalue of the test. 

1318 

1319 Notes 

1320 ----- 

1321 This test assumes residuals are homoskedastic and may reject a correct 

1322 linear specification if the residuals are heteroskedastic. 

1323 """ 

1324 if not isinstance(res, RegressionResultsWrapper): 

1325 raise TypeError("res must be a results instance from a linear model.") 

1326 frac = float_like(frac, "frac") 

1327 

1328 use_distance = bool_like(use_distance, "use_distance") 

1329 nobs = res.nobs 

1330 endog = res.model.endog 

1331 exog = res.model.exog 

1332 if order_by is not None and use_distance: 

1333 raise ValueError("order_by and use_distance cannot be simultaneously" 

1334 "used.") 

1335 if order_by is not None: 

1336 if isinstance(order_by, np.ndarray): 

1337 order_by = array_like(order_by, "order_by", ndim=1, dtype="int") 

1338 else: 

1339 if isinstance(order_by, str): 

1340 order_by = [order_by] 

1341 try: 

1342 cols = res.model.data.orig_exog[order_by].copy() 

1343 except (IndexError, KeyError): 

1344 raise TypeError("order_by must contain valid column names " 

1345 "from the exog data used to construct res," 

1346 "and exog must be a pandas DataFrame.") 

1347 name = "__index__" 

1348 while name in cols: 

1349 name += '_' 

1350 cols[name] = np.arange(cols.shape[0]) 

1351 cols = cols.sort_values(order_by) 

1352 order_by = np.asarray(cols[name]) 

1353 endog = endog[order_by] 

1354 exog = exog[order_by] 

1355 if use_distance: 

1356 center = int(nobs) // 2 if center is None else center 

1357 if isinstance(center, float): 

1358 if not 0.0 <= center <= 1.0: 

1359 raise ValueError("center must be in (0, 1) when a float.") 

1360 center = int(center * (nobs-1)) 

1361 else: 

1362 center = int_like(center, "center") 

1363 if not 0 < center < nobs - 1: 

1364 raise ValueError("center must be in [0, nobs) when an int.") 

1365 center_obs = exog[center:center+1] 

1366 from scipy.spatial.distance import cdist 

1367 try: 

1368 err = exog - center_obs 

1369 vi = np.linalg.inv(err.T @ err / nobs) 

1370 except np.linalg.LinAlgError: 

1371 err = exog - exog.mean(0) 

1372 vi = np.linalg.inv(err.T @ err / nobs) 

1373 dist = cdist(exog, center_obs, metric='mahalanobis', VI=vi) 

1374 idx = np.argsort(dist.ravel()) 

1375 endog = endog[idx] 

1376 exog = exog[idx] 

1377 

1378 lowidx = np.ceil(0.5 * (1 - frac) * nobs).astype(int) 

1379 uppidx = np.floor(lowidx + frac * nobs).astype(int) 

1380 if uppidx - lowidx < exog.shape[1]: 

1381 raise ValueError("frac is too small to perform test. frac * nobs" 

1382 "must be greater than the number of exogenous" 

1383 "variables in the model.") 

1384 mi_sl = slice(lowidx, uppidx) 

1385 res_mi = OLS(endog[mi_sl], exog[mi_sl]).fit() 

1386 nobs_mi = res_mi.model.endog.shape[0] 

1387 ss_mi = res_mi.ssr 

1388 ss = res.ssr 

1389 fstat = (ss - ss_mi) / (nobs - nobs_mi) / ss_mi * res_mi.df_resid 

1390 pval = stats.f.sf(fstat, nobs - nobs_mi, res_mi.df_resid) 

1391 return fstat, pval 

1392 

1393 

1394def linear_lm(resid, exog, func=None): 

1395 """ 

1396 Lagrange multiplier test for linearity against functional alternative 

1397 

1398 # TODO: Remove the restriction 

1399 limitations: Assumes currently that the first column is integer. 

1400 Currently it does not check whether the transformed variables contain NaNs, 

1401 for example log of negative number. 

1402 

1403 Parameters 

1404 ---------- 

1405 resid : ndarray 

1406 residuals of a regression 

1407 exog : ndarray 

1408 exogenous variables for which linearity is tested 

1409 func : callable, default None 

1410 If func is None, then squares are used. func needs to take an array 

1411 of exog and return an array of transformed variables. 

1412 

1413 Returns 

1414 ------- 

1415 lm : float 

1416 Lagrange multiplier test statistic 

1417 lm_pval : float 

1418 p-value of Lagrange multiplier tes 

1419 ftest : ContrastResult instance 

1420 the results from the F test variant of this test 

1421 

1422 Notes 

1423 ----- 

1424 Written to match Gretl's linearity test. The test runs an auxiliary 

1425 regression of the residuals on the combined original and transformed 

1426 regressors. The Null hypothesis is that the linear specification is 

1427 correct. 

1428 """ 

1429 if func is None: 

1430 def func(x): 

1431 return np.power(x, 2) 

1432 

1433 exog_aux = np.column_stack((exog, func(exog[:, 1:]))) 

1434 

1435 nobs, k_vars = exog.shape 

1436 ls = OLS(resid, exog_aux).fit() 

1437 ftest = ls.f_test(np.eye(k_vars - 1, k_vars * 2 - 1, k_vars)) 

1438 lm = nobs * ls.rsquared 

1439 lm_pval = stats.chi2.sf(lm, k_vars - 1) 

1440 return lm, lm_pval, ftest 

1441 

1442 

1443def spec_white(resid, exog): 

1444 """ 

1445 White's Two-Moment Specification Test 

1446 

1447 Parameters 

1448 ---------- 

1449 resid : array_like 

1450 OLS residuals. 

1451 exog : array_like 

1452 OLS design matrix. 

1453 

1454 Returns 

1455 ------- 

1456 stat : float 

1457 The test statistic. 

1458 pval : float 

1459 A chi-square p-value for test statistic. 

1460 dof : int 

1461 The degrees of freedom. 

1462 

1463 See Also 

1464 -------- 

1465 het_white 

1466 White's test for heteroskedasticity. 

1467 

1468 Notes 

1469 ----- 

1470 Implements the two-moment specification test described by White's 

1471 Theorem 2 (1980, p. 823) which compares the standard OLS covariance 

1472 estimator with White's heteroscedasticity-consistent estimator. The 

1473 test statistic is shown to be chi-square distributed. 

1474 

1475 Null hypothesis is homoscedastic and correctly specified. 

1476 

1477 Assumes the OLS design matrix contains an intercept term and at least 

1478 one variable. The intercept is removed to calculate the test statistic. 

1479 

1480 Interaction terms (squares and crosses of OLS regressors) are added to 

1481 the design matrix to calculate the test statistic. 

1482 

1483 Degrees-of-freedom (full rank) = nvar + nvar * (nvar + 1) / 2 

1484 

1485 Linearly dependent columns are removed to avoid singular matrix error. 

1486 

1487 References 

1488 ---------- 

1489 .. [*] White, H. (1980). A heteroskedasticity-consistent covariance matrix 

1490 estimator and a direct test for heteroscedasticity. Econometrica, 48: 

1491 817-838. 

1492 """ 

1493 x = array_like(exog, "exog", ndim=2) 

1494 e = array_like(resid, "resid", ndim=1) 

1495 if x.shape[1] < 2 or not np.any(np.ptp(x, 0) == 0.0): 

1496 raise ValueError("White's specification test requires at least two" 

1497 "columns where one is a constant.") 

1498 

1499 # add interaction terms 

1500 i0, i1 = np.triu_indices(x.shape[1]) 

1501 exog = np.delete(x[:, i0] * x[:, i1], 0, 1) 

1502 

1503 # collinearity check - see _fit_collinear 

1504 atol = 1e-14 

1505 rtol = 1e-13 

1506 tol = atol + rtol * exog.var(0) 

1507 r = np.linalg.qr(exog, mode="r") 

1508 mask = np.abs(r.diagonal()) < np.sqrt(tol) 

1509 exog = exog[:, np.where(~mask)[0]] 

1510 

1511 # calculate test statistic 

1512 sqe = e * e 

1513 sqmndevs = sqe - np.mean(sqe) 

1514 d = np.dot(exog.T, sqmndevs) 

1515 devx = exog - np.mean(exog, axis=0) 

1516 devx *= sqmndevs[:, None] 

1517 b = devx.T.dot(devx) 

1518 stat = d.dot(np.linalg.solve(b, d)) 

1519 

1520 # chi-square test 

1521 dof = devx.shape[1] 

1522 pval = stats.chi2.sf(stat, dof) 

1523 return stat, pval, dof 

1524 

1525 

1526@deprecate_kwarg("olsresults", "res") 

1527def recursive_olsresiduals(res, skip=None, lamda=0.0, alpha=0.95, 

1528 order_by=None): 

1529 """ 

1530 Calculate recursive ols with residuals and Cusum test statistic 

1531 

1532 Parameters 

1533 ---------- 

1534 res : RegressionResults 

1535 Results from estimation of a regression model. 

1536 skip : int, default None 

1537 The number of observations to use for initial OLS, if None then skip is 

1538 set equal to the number of regressors (columns in exog). 

1539 lamda : float, default 0.0 

1540 The weight for Ridge correction to initial (X'X)^{-1}. 

1541 alpha : {0.90, 0.95, 0.99}, default 0.95 

1542 Confidence level of test, currently only two values supported, 

1543 used for confidence interval in cusum graph. 

1544 order_by : array_like, default None 

1545 Integer array specifying the order of the residuals. If not provided, 

1546 the order of the residuals is not changed. If provided, must have 

1547 the same number of observations as the endogenous variable. 

1548 

1549 Returns 

1550 ------- 

1551 rresid : ndarray 

1552 The recursive ols residuals. 

1553 rparams : ndarray 

1554 The recursive ols parameter estimates. 

1555 rypred : ndarray 

1556 The recursive prediction of endogenous variable. 

1557 rresid_standardized : ndarray 

1558 The recursive residuals standardized so that N(0,sigma2) distributed, 

1559 where sigma2 is the error variance. 

1560 rresid_scaled : ndarray 

1561 The recursive residuals normalize so that N(0,1) distributed. 

1562 rcusum : ndarray 

1563 The cumulative residuals for cusum test. 

1564 rcusumci : ndarray 

1565 The confidence interval for cusum test using a size of alpha. 

1566 

1567 Notes 

1568 ----- 

1569 It produces same recursive residuals as other version. This version updates 

1570 the inverse of the X'X matrix and does not require matrix inversion during 

1571 updating. looks efficient but no timing 

1572 

1573 Confidence interval in Greene and Brown, Durbin and Evans is the same as 

1574 in Ploberger after a little bit of algebra. 

1575 

1576 References 

1577 ---------- 

1578 jplv to check formulas, follows Harvey 

1579 BigJudge 5.5.2b for formula for inverse(X'X) updating 

1580 Greene section 7.5.2 

1581 

1582 Brown, R. L., J. Durbin, and J. M. Evans. “Techniques for Testing the 

1583 Constancy of Regression Relationships over Time.” 

1584 Journal of the Royal Statistical Society. Series B (Methodological) 37, 

1585 no. 2 (1975): 149-192. 

1586 """ 

1587 y = res.model.endog 

1588 x = res.model.exog 

1589 order_by = array_like(order_by, "order_by", dtype="int", optional=True, 

1590 ndim=1, shape=(y.shape[0],)) 

1591 # intialize with skip observations 

1592 if order_by is not None: 

1593 x = x[order_by] 

1594 y = y[order_by] 

1595 

1596 nobs, nvars = x.shape 

1597 if skip is None: 

1598 skip = nvars 

1599 rparams = np.nan * np.zeros((nobs, nvars)) 

1600 rresid = np.nan * np.zeros(nobs) 

1601 rypred = np.nan * np.zeros(nobs) 

1602 rvarraw = np.nan * np.zeros(nobs) 

1603 

1604 x0 = x[:skip] 

1605 y0 = y[:skip] 

1606 # add Ridge to start (not in jplv 

1607 xtxi = np.linalg.inv(np.dot(x0.T, x0) + lamda * np.eye(nvars)) 

1608 xty = np.dot(x0.T, y0) # xi * y #np.dot(xi, y) 

1609 beta = np.dot(xtxi, xty) 

1610 rparams[skip - 1] = beta 

1611 yipred = np.dot(x[skip - 1], beta) 

1612 rypred[skip - 1] = yipred 

1613 rresid[skip - 1] = y[skip - 1] - yipred 

1614 rvarraw[skip - 1] = 1 + np.dot(x[skip - 1], np.dot(xtxi, x[skip - 1])) 

1615 for i in range(skip, nobs): 

1616 xi = x[i:i + 1, :] 

1617 yi = y[i] 

1618 

1619 # get prediction error with previous beta 

1620 yipred = np.dot(xi, beta) 

1621 rypred[i] = yipred 

1622 residi = yi - yipred 

1623 rresid[i] = residi 

1624 

1625 # update beta and inverse(X'X) 

1626 tmp = np.dot(xtxi, xi.T) 

1627 ft = 1 + np.dot(xi, tmp) 

1628 

1629 xtxi = xtxi - np.dot(tmp, tmp.T) / ft # BigJudge equ 5.5.15 

1630 

1631 beta = beta + (tmp * residi / ft).ravel() # BigJudge equ 5.5.14 

1632 rparams[i] = beta 

1633 rvarraw[i] = ft 

1634 

1635 rresid_scaled = rresid / np.sqrt(rvarraw) # N(0,sigma2) distributed 

1636 nrr = nobs - skip 

1637 # sigma2 = rresid_scaled[skip-1:].var(ddof=1) #var or sum of squares ? 

1638 # Greene has var, jplv and Ploberger have sum of squares (Ass.:mean=0) 

1639 # Gretl uses: by reverse engineering matching their numbers 

1640 sigma2 = rresid_scaled[skip:].var(ddof=1) 

1641 rresid_standardized = rresid_scaled / np.sqrt(sigma2) # N(0,1) distributed 

1642 rcusum = rresid_standardized[skip - 1:].cumsum() 

1643 # confidence interval points in Greene p136 looks strange. Cleared up 

1644 # this assumes sum of independent standard normal, which does not take into 

1645 # account that we make many tests at the same time 

1646 if alpha == 0.90: 

1647 a = 0.850 

1648 elif alpha == 0.95: 

1649 a = 0.948 

1650 elif alpha == 0.99: 

1651 a = 1.143 

1652 else: 

1653 raise ValueError("alpha can only be 0.9, 0.95 or 0.99") 

1654 

1655 # following taken from Ploberger, 

1656 # crit = a * np.sqrt(nrr) 

1657 rcusumci = (a * np.sqrt(nrr) + 2 * a * np.arange(0, nobs - skip) / np.sqrt( 

1658 nrr)) * np.array([[-1.], [+1.]]) 

1659 return (rresid, rparams, rypred, rresid_standardized, rresid_scaled, 

1660 rcusum, rcusumci) 

1661 

1662 

1663def breaks_hansen(olsresults): 

1664 """ 

1665 Test for model stability, breaks in parameters for ols, Hansen 1992 

1666 

1667 Parameters 

1668 ---------- 

1669 olsresults : RegressionResults 

1670 Results from estimation of a regression model. 

1671 

1672 Returns 

1673 ------- 

1674 teststat : float 

1675 Hansen's test statistic. 

1676 crit : ndarray 

1677 The critical values at alpha=0.95 for different nvars. 

1678 

1679 Notes 

1680 ----- 

1681 looks good in example, maybe not very powerful for small changes in 

1682 parameters 

1683 

1684 According to Greene, distribution of test statistics depends on nvar but 

1685 not on nobs. 

1686 

1687 Test statistic is verified against R:strucchange 

1688 

1689 References 

1690 ---------- 

1691 Greene section 7.5.1, notation follows Greene 

1692 """ 

1693 x = olsresults.model.exog 

1694 resid = array_like(olsresults.resid, "resid", shape=(x.shape[0], 1)) 

1695 nobs, nvars = x.shape 

1696 resid2 = resid ** 2 

1697 ft = np.c_[x * resid[:, None], (resid2 - resid2.mean())] 

1698 score = ft.cumsum(0) 

1699 f = nobs * (ft[:, :, None] * ft[:, None, :]).sum(0) 

1700 s = (score[:, :, None] * score[:, None, :]).sum(0) 

1701 h = np.trace(np.dot(np.linalg.inv(f), s)) 

1702 crit95 = np.array([(2, 1.9), (6, 3.75), (15, 3.75), (19, 4.52)], 

1703 dtype=[("nobs", int), ("crit", float)]) 

1704 # TODO: get critical values from Bruce Hansen's 1992 paper 

1705 return h, crit95 

1706 

1707 

1708def breaks_cusumolsresid(resid, ddof=0): 

1709 """ 

1710 Cusum test for parameter stability based on ols residuals. 

1711 

1712 Parameters 

1713 ---------- 

1714 resid : ndarray 

1715 An array of residuals from an OLS estimation. 

1716 ddof : int 

1717 The number of parameters in the OLS estimation, used as degrees 

1718 of freedom correction for error variance. 

1719 

1720 Returns 

1721 ------- 

1722 sup_b : float 

1723 The test statistic, maximum of absolute value of scaled cumulative OLS 

1724 residuals. 

1725 pval : float 

1726 Probability of observing the data under the null hypothesis of no 

1727 structural change, based on asymptotic distribution which is a Brownian 

1728 Bridge 

1729 crit: list 

1730 The tabulated critical values, for alpha = 1%, 5% and 10%. 

1731 

1732 Notes 

1733 ----- 

1734 Tested against R:structchange. 

1735 

1736 Not clear: Assumption 2 in Ploberger, Kramer assumes that exog x have 

1737 asymptotically zero mean, x.mean(0) = [1, 0, 0, ..., 0] 

1738 Is this really necessary? I do not see how it can affect the test statistic 

1739 under the null. It does make a difference under the alternative. 

1740 Also, the asymptotic distribution of test statistic depends on this. 

1741 

1742 From examples it looks like there is little power for standard cusum if 

1743 exog (other than constant) have mean zero. 

1744 

1745 References 

1746 ---------- 

1747 Ploberger, Werner, and Walter Kramer. “The Cusum Test with OLS Residuals.” 

1748 Econometrica 60, no. 2 (March 1992): 271-285. 

1749 """ 

1750 resid = resid.ravel() 

1751 nobs = len(resid) 

1752 nobssigma2 = (resid ** 2).sum() 

1753 if ddof > 0: 

1754 nobssigma2 = nobssigma2 / (nobs - ddof) * nobs 

1755 # b is asymptotically a Brownian Bridge 

1756 b = resid.cumsum() / np.sqrt(nobssigma2) # use T*sigma directly 

1757 # asymptotically distributed as standard Brownian Bridge 

1758 sup_b = np.abs(b).max() 

1759 crit = [(1, 1.63), (5, 1.36), (10, 1.22)] 

1760 # Note stats.kstwobign.isf(0.1) is distribution of sup.abs of Brownian 

1761 # Bridge 

1762 # >>> stats.kstwobign.isf([0.01,0.05,0.1]) 

1763 # array([ 1.62762361, 1.35809864, 1.22384787]) 

1764 pval = stats.kstwobign.sf(sup_b) 

1765 return sup_b, pval, crit 

1766 

1767# def breaks_cusum(recolsresid): 

1768# """renormalized cusum test for parameter stability based on recursive 

1769# residuals 

1770# 

1771# 

1772# still incorrect: in PK, the normalization for sigma is by T not T-K 

1773# also the test statistic is asymptotically a Wiener Process, Brownian 

1774# motion 

1775# not Brownian Bridge 

1776# for testing: result reject should be identical as in standard cusum 

1777# version 

1778# 

1779# References 

1780# ---------- 

1781# Ploberger, Werner, and Walter Kramer. “The Cusum Test with OLS Residuals.” 

1782# Econometrica 60, no. 2 (March 1992): 271-285. 

1783# 

1784# """ 

1785# resid = recolsresid.ravel() 

1786# nobssigma2 = (resid**2).sum() 

1787# #B is asymptotically a Brownian Bridge 

1788# B = resid.cumsum()/np.sqrt(nobssigma2) # use T*sigma directly 

1789# nobs = len(resid) 

1790# denom = 1. + 2. * np.arange(nobs)/(nobs-1.) #not sure about limits 

1791# sup_b = np.abs(B/denom).max() 

1792# #asymptotically distributed as standard Brownian Bridge 

1793# crit = [(1,1.63), (5, 1.36), (10, 1.22)] 

1794# #Note stats.kstwobign.isf(0.1) is distribution of sup.abs of Brownian 

1795# Bridge 

1796# #>>> stats.kstwobign.isf([0.01,0.05,0.1]) 

1797# #array([ 1.62762361, 1.35809864, 1.22384787]) 

1798# pval = stats.kstwobign.sf(sup_b) 

1799# return sup_b, pval, crit