Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1import numpy as np 

2from scipy.stats import f as fdist 

3from scipy.stats import t as student_t 

4from scipy import stats 

5from statsmodels.tools.tools import clean0, fullrank 

6from statsmodels.stats.multitest import multipletests 

7 

8 

9#TODO: should this be public if it's just a container? 

10class ContrastResults(object): 

11 """ 

12 Class for results of tests of linear restrictions on coefficients in a model. 

13 

14 This class functions mainly as a container for `t_test`, `f_test` and 

15 `wald_test` for the parameters of a model. 

16 

17 The attributes depend on the statistical test and are either based on the 

18 normal, the t, the F or the chisquare distribution. 

19 """ 

20 

21 def __init__(self, t=None, F=None, sd=None, effect=None, df_denom=None, 

22 df_num=None, alpha=0.05, **kwds): 

23 

24 self.effect = effect # Let it be None for F 

25 if F is not None: 

26 self.distribution = 'F' 

27 self.fvalue = F 

28 self.statistic = self.fvalue 

29 self.df_denom = df_denom 

30 self.df_num = df_num 

31 self.dist = fdist 

32 self.dist_args = (df_num, df_denom) 

33 self.pvalue = fdist.sf(F, df_num, df_denom) 

34 elif t is not None: 

35 self.distribution = 't' 

36 self.tvalue = t 

37 self.statistic = t # generic alias 

38 self.sd = sd 

39 self.df_denom = df_denom 

40 self.dist = student_t 

41 self.dist_args = (df_denom,) 

42 self.pvalue = self.dist.sf(np.abs(t), df_denom) * 2 

43 elif 'statistic' in kwds: 

44 # TODO: currently targeted to normal distribution, and chi2 

45 self.distribution = kwds['distribution'] 

46 self.statistic = kwds['statistic'] 

47 self.tvalue = value = kwds['statistic'] # keep alias 

48 # TODO: for results instance we decided to use tvalues also for normal 

49 self.sd = sd 

50 self.dist = getattr(stats, self.distribution) 

51 self.dist_args = kwds.get('dist_args', ()) 

52 if self.distribution == 'chi2': 

53 self.pvalue = self.dist.sf(self.statistic, df_denom) 

54 self.df_denom = df_denom 

55 else: 

56 "normal" 

57 self.pvalue = np.full_like(value, np.nan) 

58 not_nan = ~np.isnan(value) 

59 self.pvalue[not_nan] = self.dist.sf(np.abs(value[not_nan])) * 2 

60 

61 # cleanup 

62 # should we return python scalar? 

63 self.pvalue = np.squeeze(self.pvalue) 

64 

65 

66 def conf_int(self, alpha=0.05): 

67 """ 

68 Returns the confidence interval of the value, `effect` of the constraint. 

69 

70 This is currently only available for t and z tests. 

71 

72 Parameters 

73 ---------- 

74 alpha : float, optional 

75 The significance level for the confidence interval. 

76 ie., The default `alpha` = .05 returns a 95% confidence interval. 

77 

78 Returns 

79 ------- 

80 ci : ndarray, (k_constraints, 2) 

81 The array has the lower and the upper limit of the confidence 

82 interval in the columns. 

83 """ 

84 if self.effect is not None: 

85 # confidence intervals 

86 q = self.dist.ppf(1 - alpha / 2., *self.dist_args) 

87 lower = self.effect - q * self.sd 

88 upper = self.effect + q * self.sd 

89 return np.column_stack((lower, upper)) 

90 else: 

91 raise NotImplementedError('Confidence Interval not available') 

92 

93 def __str__(self): 

94 return self.summary().__str__() 

95 

96 def __repr__(self): 

97 return str(self.__class__) + '\n' + self.__str__() 

98 

99 def summary(self, xname=None, alpha=0.05, title=None): 

100 """Summarize the Results of the hypothesis test 

101 

102 Parameters 

103 ---------- 

104 xname : list[str], optional 

105 Default is `c_##` for ## in the number of regressors 

106 alpha : float 

107 significance level for the confidence intervals. Default is 

108 alpha = 0.05 which implies a confidence level of 95%. 

109 title : str, optional 

110 Title for the params table. If not None, then this replaces the 

111 default title 

112 

113 Returns 

114 ------- 

115 smry : str or Summary instance 

116 This contains a parameter results table in the case of t or z test 

117 in the same form as the parameter results table in the model 

118 results summary. 

119 For F or Wald test, the return is a string. 

120 """ 

121 if self.effect is not None: 

122 # TODO: should also add some extra information, e.g. robust cov ? 

123 # TODO: can we infer names for constraints, xname in __init__ ? 

124 if title is None: 

125 title = 'Test for Constraints' 

126 elif title == '': 

127 # do not add any title, 

128 # I think SimpleTable skips on None - check 

129 title = None 

130 # we have everything for a params table 

131 use_t = (self.distribution == 't') 

132 yname='constraints' # Not used in params_frame 

133 if xname is None: 

134 xname = ['c%d' % ii for ii in range(len(self.effect))] 

135 from statsmodels.iolib.summary import summary_params 

136 pvalues = np.atleast_1d(self.pvalue) 

137 summ = summary_params((self, self.effect, self.sd, self.statistic, 

138 pvalues, self.conf_int(alpha)), 

139 yname=yname, xname=xname, use_t=use_t, 

140 title=title, alpha=alpha) 

141 return summ 

142 elif hasattr(self, 'fvalue'): 

143 # TODO: create something nicer for these casee 

144 return ('<F test: F=%s, p=%s, df_denom=%.3g, df_num=%.3g>' % 

145 (repr(self.fvalue), self.pvalue, self.df_denom, 

146 self.df_num)) 

147 elif self.distribution == 'chi2': 

148 return ('<Wald test (%s): statistic=%s, p-value=%s, df_denom=%.3g>' % 

149 (self.distribution, self.statistic, self.pvalue, 

150 self.df_denom)) 

151 else: 

152 # generic 

153 return ('<Wald test: statistic=%s, p-value=%s>' % 

154 (self.statistic, self.pvalue)) 

155 

156 

157 def summary_frame(self, xname=None, alpha=0.05): 

158 """Return the parameter table as a pandas DataFrame 

159 

160 This is only available for t and normal tests 

161 """ 

162 if self.effect is not None: 

163 # we have everything for a params table 

164 use_t = (self.distribution == 't') 

165 yname='constraints' # Not used in params_frame 

166 if xname is None: 

167 xname = ['c%d' % ii for ii in range(len(self.effect))] 

168 from statsmodels.iolib.summary import summary_params_frame 

169 summ = summary_params_frame((self, self.effect, self.sd, 

170 self.statistic,self.pvalue, 

171 self.conf_int(alpha)), yname=yname, 

172 xname=xname, use_t=use_t, 

173 alpha=alpha) 

174 return summ 

175 else: 

176 # TODO: create something nicer 

177 raise NotImplementedError('only available for t and z') 

178 

179 

180 

181class Contrast(object): 

182 """ 

183 This class is used to construct contrast matrices in regression models. 

184 

185 They are specified by a (term, design) pair. The term, T, is a linear 

186 combination of columns of the design matrix. The matrix attribute of 

187 Contrast is a contrast matrix C so that 

188 

189 colspan(dot(D, C)) = colspan(dot(D, dot(pinv(D), T))) 

190 

191 where pinv(D) is the generalized inverse of D. Further, the matrix 

192 

193 Tnew = dot(C, D) 

194 

195 is full rank. The rank attribute is the rank of 

196 

197 dot(D, dot(pinv(D), T)) 

198 

199 In a regression model, the contrast tests that E(dot(Tnew, Y)) = 0 

200 for each column of Tnew. 

201 

202 Parameters 

203 ---------- 

204 term : array_like 

205 design : array_like 

206 

207 Attributes 

208 ---------- 

209 contrast_matrix 

210 

211 Examples 

212 -------- 

213 >>> import statsmodels.api as sm 

214 >>> from statsmodels.stats.contrast import Contrast 

215 >>> import numpy as np 

216 >>> np.random.seed(54321) 

217 >>> X = np.random.standard_normal((40,10)) 

218 # Get a contrast 

219 >>> new_term = np.column_stack((X[:,0], X[:,2])) 

220 >>> c = Contrast(new_term, X) 

221 >>> test = [[1] + [0]*9, [0]*2 + [1] + [0]*7] 

222 >>> np.allclose(c.contrast_matrix, test) 

223 True 

224 

225 Get another contrast 

226 

227 >>> P = np.dot(X, np.linalg.pinv(X)) 

228 >>> resid = np.identity(40) - P 

229 >>> noise = np.dot(resid,np.random.standard_normal((40,5))) 

230 >>> new_term2 = np.column_stack((noise,X[:,2])) 

231 >>> c2 = Contrast(new_term2, X) 

232 >>> print(c2.contrast_matrix) 

233 [ -1.26424750e-16 8.59467391e-17 1.56384718e-01 -2.60875560e-17 

234 -7.77260726e-17 -8.41929574e-18 -7.36359622e-17 -1.39760860e-16 

235 1.82976904e-16 -3.75277947e-18] 

236 

237 Get another contrast 

238 

239 >>> zero = np.zeros((40,)) 

240 >>> new_term3 = np.column_stack((zero,X[:,2])) 

241 >>> c3 = Contrast(new_term3, X) 

242 >>> test2 = [0]*2 + [1] + [0]*7 

243 >>> np.allclose(c3.contrast_matrix, test2) 

244 True 

245 """ 

246 def _get_matrix(self): 

247 """ 

248 Gets the contrast_matrix property 

249 """ 

250 if not hasattr(self, "_contrast_matrix"): 

251 self.compute_matrix() 

252 return self._contrast_matrix 

253 

254 contrast_matrix = property(_get_matrix) 

255 

256 def __init__(self, term, design): 

257 self.term = np.asarray(term) 

258 self.design = np.asarray(design) 

259 

260 def compute_matrix(self): 

261 """ 

262 Construct a contrast matrix C so that 

263 

264 colspan(dot(D, C)) = colspan(dot(D, dot(pinv(D), T))) 

265 

266 where pinv(D) is the generalized inverse of D=design. 

267 """ 

268 

269 T = self.term 

270 if T.ndim == 1: 

271 T = T[:,None] 

272 

273 self.T = clean0(T) 

274 self.D = self.design 

275 self._contrast_matrix = contrastfromcols(self.T, self.D) 

276 try: 

277 self.rank = self.matrix.shape[1] 

278 except: 

279 self.rank = 1 

280 

281#TODO: fix docstring after usage is settled 

282def contrastfromcols(L, D, pseudo=None): 

283 """ 

284 From an n x p design matrix D and a matrix L, tries 

285 to determine a p x q contrast matrix C which 

286 determines a contrast of full rank, i.e. the 

287 n x q matrix 

288 

289 dot(transpose(C), pinv(D)) 

290 

291 is full rank. 

292 

293 L must satisfy either L.shape[0] == n or L.shape[1] == p. 

294 

295 If L.shape[0] == n, then L is thought of as representing 

296 columns in the column space of D. 

297 

298 If L.shape[1] == p, then L is thought of as what is known 

299 as a contrast matrix. In this case, this function returns an estimable 

300 contrast corresponding to the dot(D, L.T) 

301 

302 Note that this always produces a meaningful contrast, not always 

303 with the intended properties because q is always non-zero unless 

304 L is identically 0. That is, it produces a contrast that spans 

305 the column space of L (after projection onto the column space of D). 

306 

307 Parameters 

308 ---------- 

309 L : array_like 

310 D : array_like 

311 """ 

312 L = np.asarray(L) 

313 D = np.asarray(D) 

314 

315 n, p = D.shape 

316 

317 if L.shape[0] != n and L.shape[1] != p: 

318 raise ValueError("shape of L and D mismatched") 

319 

320 if pseudo is None: 

321 pseudo = np.linalg.pinv(D) # D^+ \approx= ((dot(D.T,D))^(-1),D.T) 

322 

323 if L.shape[0] == n: 

324 C = np.dot(pseudo, L).T 

325 else: 

326 C = L 

327 C = np.dot(pseudo, np.dot(D, C.T)).T 

328 

329 Lp = np.dot(D, C.T) 

330 

331 if len(Lp.shape) == 1: 

332 Lp.shape = (n, 1) 

333 

334 if np.linalg.matrix_rank(Lp) != Lp.shape[1]: 

335 Lp = fullrank(Lp) 

336 C = np.dot(pseudo, Lp).T 

337 

338 return np.squeeze(C) 

339 

340 

341# TODO: this is currently a minimal version, stub 

342class WaldTestResults(object): 

343 # for F and chi2 tests of joint hypothesis, mainly for vectorized 

344 

345 def __init__(self, statistic, distribution, dist_args, table=None, 

346 pvalues=None): 

347 self.table = table 

348 

349 self.distribution = distribution 

350 self.statistic = statistic 

351 #self.sd = sd 

352 self.dist_args = dist_args 

353 

354 # The following is because I do not know which we want 

355 if table is not None: 

356 self.statistic = table['statistic'].values 

357 self.pvalues = table['pvalue'].values 

358 self.df_constraints = table['df_constraint'].values 

359 if self.distribution == 'F': 

360 self.df_denom = table['df_denom'].values 

361 

362 else: 

363 if self.distribution == 'chi2': 

364 self.dist = stats.chi2 

365 self.df_constraints = self.dist_args[0] # assumes tuple 

366 # using dist_args[0] is a bit dangerous, 

367 elif self.distribution == 'F': 

368 self.dist = stats.f 

369 self.df_constraints, self.df_denom = self.dist_args 

370 

371 else: 

372 raise ValueError('only F and chi2 are possible distribution') 

373 

374 if pvalues is None: 

375 self.pvalues = self.dist.sf(np.abs(statistic), *dist_args) 

376 else: 

377 self.pvalues = pvalues 

378 

379 @property 

380 def col_names(self): 

381 """column names for summary table 

382 """ 

383 

384 pr_test = "P>%s" % self.distribution 

385 col_names = [self.distribution, pr_test, 'df constraint'] 

386 if self.distribution == 'F': 

387 col_names.append('df denom') 

388 return col_names 

389 

390 def summary_frame(self): 

391 # needs to be a method for consistency 

392 if hasattr(self, '_dframe'): 

393 return self._dframe 

394 # rename the column nambes, but do not copy data 

395 renaming = dict(zip(self.table.columns, self.col_names)) 

396 self.dframe = self.table.rename(columns=renaming) 

397 return self.dframe 

398 

399 

400 def __str__(self): 

401 return self.summary_frame().to_string() 

402 

403 

404 def __repr__(self): 

405 return str(self.__class__) + '\n' + self.__str__() 

406 

407 

408# t_test for pairwise comparison and automatic contrast/restrictions 

409 

410 

411def _get_pairs_labels(k_level, level_names): 

412 """helper function for labels for pairwise comparisons 

413 """ 

414 idx_pairs_all = np.triu_indices(k_level, 1) 

415 labels = ['%s-%s' % (level_names[name[1]], level_names[name[0]]) 

416 for name in zip(*idx_pairs_all)] 

417 return labels 

418 

419def _contrast_pairs(k_params, k_level, idx_start): 

420 """create pairwise contrast for reference coding 

421 

422 currently not used, 

423 using encoding contrast matrix is more general, but requires requires 

424 factor information from patsy design_info. 

425 

426 

427 Parameters 

428 ---------- 

429 k_params : int 

430 number of parameters 

431 k_level : int 

432 number of levels or categories (including reference case) 

433 idx_start : int 

434 Index of the first parameter of this factor. The restrictions on the 

435 factor are inserted as a block in the full restriction matrix starting 

436 at column with index `idx_start`. 

437 

438 Returns 

439 ------- 

440 contrasts : ndarray 

441 restriction matrix with k_params columns and number of rows equal to 

442 the number of restrictions. 

443 """ 

444 k_level_m1 = k_level - 1 

445 idx_pairs = np.triu_indices(k_level_m1, 1) 

446 

447 k = len(idx_pairs[0]) 

448 c_pairs = np.zeros((k, k_level_m1)) 

449 c_pairs[np.arange(k), idx_pairs[0]] = -1 

450 c_pairs[np.arange(k), idx_pairs[1]] = 1 

451 c_reference = np.eye(k_level_m1) 

452 c = np.concatenate((c_reference, c_pairs), axis=0) 

453 k_all = c.shape[0] 

454 

455 contrasts = np.zeros((k_all, k_params)) 

456 contrasts[:, idx_start : idx_start + k_level_m1] = c 

457 

458 return contrasts 

459 

460 

461def t_test_multi(result, contrasts, method='hs', alpha=0.05, ci_method=None, 

462 contrast_names=None): 

463 """perform t_test and add multiplicity correction to results dataframe 

464 

465 Parameters 

466 ---------- 

467 result results instance 

468 results of an estimated model 

469 contrasts : ndarray 

470 restriction matrix for t_test 

471 method : str or list of strings 

472 method for multiple testing p-value correction, default is'hs'. 

473 alpha : float 

474 significance level for multiple testing reject decision. 

475 ci_method : None 

476 not used yet, will be for multiplicity corrected confidence intervals 

477 contrast_names : {list[str], None} 

478 If contrast_names are provided, then they are used in the index of the 

479 returned dataframe, otherwise some generic default names are created. 

480 

481 Returns 

482 ------- 

483 res_df : pandas DataFrame 

484 The dataframe contains the results of the t_test and additional columns 

485 for multiplicity corrected p-values and boolean indicator for whether 

486 the Null hypothesis is rejected. 

487 """ 

488 tt = result.t_test(contrasts) 

489 res_df = tt.summary_frame(xname=contrast_names) 

490 

491 if type(method) is not list: 

492 method = [method] 

493 for meth in method: 

494 mt = multipletests(tt.pvalue, method=meth, alpha=alpha) 

495 res_df['pvalue-%s' % meth] = mt[1] 

496 res_df['reject-%s' % meth] = mt[0] 

497 return res_df 

498 

499 

500class MultiCompResult(object): 

501 """class to hold return of t_test_pairwise 

502 

503 currently just a minimal class to hold attributes. 

504 """ 

505 def __init__(self, **kwargs): 

506 self.__dict__.update(kwargs) 

507 

508 

509def _embed_constraints(contrasts, k_params, idx_start, index=None): 

510 """helper function to expand constraints to a full restriction matrix 

511 

512 Parameters 

513 ---------- 

514 contrasts : ndarray 

515 restriction matrix for t_test 

516 k_params : int 

517 number of parameters 

518 idx_start : int 

519 Index of the first parameter of this factor. The restrictions on the 

520 factor are inserted as a block in the full restriction matrix starting 

521 at column with index `idx_start`. 

522 index : slice or ndarray 

523 Column index if constraints do not form a block in the full restriction 

524 matrix, i.e. if parameters that are subject to restrictions are not 

525 consecutive in the list of parameters. 

526 If index is not None, then idx_start is ignored. 

527 

528 Returns 

529 ------- 

530 contrasts : ndarray 

531 restriction matrix with k_params columns and number of rows equal to 

532 the number of restrictions. 

533 """ 

534 

535 k_c, k_p = contrasts.shape 

536 c = np.zeros((k_c, k_params)) 

537 if index is None: 

538 c[:, idx_start : idx_start + k_p] = contrasts 

539 else: 

540 c[:, index] = contrasts 

541 return c 

542 

543 

544def _constraints_factor(encoding_matrix, comparison='pairwise', k_params=None, 

545 idx_start=None): 

546 """helper function to create constraints based on encoding matrix 

547 

548 Parameters 

549 ---------- 

550 encoding_matrix : ndarray 

551 contrast matrix for the encoding of a factor as defined by patsy. 

552 The number of rows should be equal to the number of levels or categories 

553 of the factor, the number of columns should be equal to the number 

554 of parameters for this factor. 

555 comparison : str 

556 Currently only 'pairwise' is implemented. The restriction matrix 

557 can be used for testing the hypothesis that all pairwise differences 

558 are zero. 

559 k_params : int 

560 number of parameters 

561 idx_start : int 

562 Index of the first parameter of this factor. The restrictions on the 

563 factor are inserted as a block in the full restriction matrix starting 

564 at column with index `idx_start`. 

565 

566 Returns 

567 ------- 

568 contrast : ndarray 

569 Contrast or restriction matrix that can be used in hypothesis test 

570 of model results. The number of columns is k_params. 

571 """ 

572 

573 cm = encoding_matrix 

574 k_level, k_p = cm.shape 

575 

576 import statsmodels.sandbox.stats.multicomp as mc 

577 if comparison in ['pairwise', 'pw', 'pairs']: 

578 c_all = -mc.contrast_allpairs(k_level) 

579 else: 

580 raise NotImplementedError('currentlyonly pairwise comparison') 

581 

582 contrasts = c_all.dot(cm) 

583 if k_params is not None: 

584 if idx_start is None: 

585 raise ValueError("if k_params is not None, then idx_start is " 

586 "required") 

587 contrasts = _embed_constraints(contrasts, k_params, idx_start) 

588 return contrasts 

589 

590 

591def t_test_pairwise(result, term_name, method='hs', alpha=0.05, 

592 factor_labels=None, ignore=False): 

593 """ 

594 Perform pairwise t_test with multiple testing corrected p-values. 

595 

596 This uses the formula design_info encoding contrast matrix and should 

597 work for all encodings of a main effect. 

598 

599 Parameters 

600 ---------- 

601 result : result instance 

602 The results of an estimated model with a categorical main effect. 

603 term_name : str 

604 name of the term for which pairwise comparisons are computed. 

605 Term names for categorical effects are created by patsy and 

606 correspond to the main part of the exog names. 

607 method : {str, list[str]} 

608 multiple testing p-value correction, default is 'hs', 

609 see stats.multipletesting 

610 alpha : float 

611 significance level for multiple testing reject decision. 

612 factor_labels : {list[str], None} 

613 Labels for the factor levels used for pairwise labels. If not 

614 provided, then the labels from the formula design_info are used. 

615 ignore : bool 

616 Turn off some of the exceptions raised by input checks. 

617 

618 Returns 

619 ------- 

620 MultiCompResult 

621 The results are stored as attributes, the main attributes are the 

622 following two. Other attributes are added for debugging purposes 

623 or as background information. 

624 

625 - result_frame : pandas DataFrame with t_test results and multiple 

626 testing corrected p-values. 

627 - contrasts : matrix of constraints of the null hypothesis in the 

628 t_test. 

629 

630 Notes 

631 ----- 

632 

633 Status: experimental. Currently only checked for treatment coding with 

634 and without specified reference level. 

635 

636 Currently there are no multiple testing corrected confidence intervals 

637 available. 

638 """ 

639 

640 desinfo = result.model.data.design_info 

641 term_idx = desinfo.term_names.index(term_name) 

642 term = desinfo.terms[term_idx] 

643 idx_start = desinfo.term_slices[term].start 

644 if not ignore and len(term.factors) > 1: 

645 raise ValueError('interaction effects not yet supported') 

646 factor = term.factors[0] 

647 cat = desinfo.factor_infos[factor].categories 

648 if factor_labels is not None: 

649 if len(factor_labels) == len(cat): 

650 cat = factor_labels 

651 else: 

652 raise ValueError("factor_labels has the wrong length, should be %d" % len(cat)) 

653 

654 

655 k_level = len(cat) 

656 cm = desinfo.term_codings[term][0].contrast_matrices[factor].matrix 

657 

658 k_params = len(result.params) 

659 labels = _get_pairs_labels(k_level, cat) 

660 

661 import statsmodels.sandbox.stats.multicomp as mc 

662 c_all_pairs = -mc.contrast_allpairs(k_level) 

663 contrasts_sub = c_all_pairs.dot(cm) 

664 contrasts = _embed_constraints(contrasts_sub, k_params, idx_start) 

665 res_df = t_test_multi(result, contrasts, method=method, ci_method=None, 

666 alpha=alpha, contrast_names=labels) 

667 res = MultiCompResult(result_frame=res_df, 

668 contrasts=contrasts, 

669 term=term, 

670 contrast_labels=labels, 

671 term_encoding_matrix=cm) 

672 return res