Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Procedures for fitting marginal regression models to dependent data 

3using Generalized Estimating Equations. 

4 

5References 

6---------- 

7KY Liang and S Zeger. "Longitudinal data analysis using 

8generalized linear models". Biometrika (1986) 73 (1): 13-22. 

9 

10S Zeger and KY Liang. "Longitudinal Data Analysis for Discrete and 

11Continuous Outcomes". Biometrics Vol. 42, No. 1 (Mar., 1986), 

12pp. 121-130 

13 

14A Rotnitzky and NP Jewell (1990). "Hypothesis testing of regression 

15parameters in semiparametric generalized linear models for cluster 

16correlated data", Biometrika, 77, 485-497. 

17 

18Xu Guo and Wei Pan (2002). "Small sample performance of the score 

19test in GEE". 

20http://www.sph.umn.edu/faculty1/wp-content/uploads/2012/11/rr2002-013.pdf 

21 

22LA Mancl LA, TA DeRouen (2001). A covariance estimator for GEE with 

23improved small-sample properties. Biometrics. 2001 Mar;57(1):126-34. 

24""" 

25from statsmodels.compat.python import lzip 

26from statsmodels.compat.pandas import Appender 

27 

28import numpy as np 

29from scipy import stats 

30import pandas as pd 

31import patsy 

32from collections import defaultdict 

33from statsmodels.tools.decorators import cache_readonly 

34import statsmodels.base.model as base 

35# used for wrapper: 

36import statsmodels.regression.linear_model as lm 

37import statsmodels.base.wrapper as wrap 

38 

39from statsmodels.genmod import families 

40from statsmodels.genmod.generalized_linear_model import GLM 

41from statsmodels.genmod import cov_struct as cov_structs 

42 

43import statsmodels.genmod.families.varfuncs as varfuncs 

44from statsmodels.genmod.families.links import Link 

45 

46from statsmodels.tools.sm_exceptions import (ConvergenceWarning, 

47 DomainWarning, 

48 IterationLimitWarning, 

49 ValueWarning) 

50import warnings 

51 

52from statsmodels.graphics._regressionplots_doc import ( 

53 _plot_added_variable_doc, 

54 _plot_partial_residuals_doc, 

55 _plot_ceres_residuals_doc) 

56from statsmodels.discrete.discrete_margins import ( 

57 _get_margeff_exog, _check_margeff_args, _effects_at, margeff_cov_with_se, 

58 _check_at_is_all, _transform_names, _check_discrete_args, 

59 _get_dummy_index, _get_count_index) 

60 

61 

62class ParameterConstraint(object): 

63 """ 

64 A class for managing linear equality constraints for a parameter 

65 vector. 

66 """ 

67 

68 def __init__(self, lhs, rhs, exog): 

69 """ 

70 Parameters 

71 ---------- 

72 lhs : ndarray 

73 A q x p matrix which is the left hand side of the 

74 constraint lhs * param = rhs. The number of constraints is 

75 q >= 1 and p is the dimension of the parameter vector. 

76 rhs : ndarray 

77 A 1-dimensional vector of length q which is the right hand 

78 side of the constraint equation. 

79 exog : ndarray 

80 The n x p exognenous data for the full model. 

81 """ 

82 

83 # In case a row or column vector is passed (patsy linear 

84 # constraints passes a column vector). 

85 rhs = np.atleast_1d(rhs.squeeze()) 

86 

87 if rhs.ndim > 1: 

88 raise ValueError("The right hand side of the constraint " 

89 "must be a vector.") 

90 

91 if len(rhs) != lhs.shape[0]: 

92 raise ValueError("The number of rows of the left hand " 

93 "side constraint matrix L must equal " 

94 "the length of the right hand side " 

95 "constraint vector R.") 

96 

97 self.lhs = lhs 

98 self.rhs = rhs 

99 

100 # The columns of lhs0 are an orthogonal basis for the 

101 # orthogonal complement to row(lhs), the columns of lhs1 are 

102 # an orthogonal basis for row(lhs). The columns of lhsf = 

103 # [lhs0, lhs1] are mutually orthogonal. 

104 lhs_u, lhs_s, lhs_vt = np.linalg.svd(lhs.T, full_matrices=1) 

105 self.lhs0 = lhs_u[:, len(lhs_s):] 

106 self.lhs1 = lhs_u[:, 0:len(lhs_s)] 

107 self.lhsf = np.hstack((self.lhs0, self.lhs1)) 

108 

109 # param0 is one solution to the underdetermined system 

110 # L * param = R. 

111 self.param0 = np.dot(self.lhs1, np.dot(lhs_vt, self.rhs) / 

112 lhs_s) 

113 

114 self._offset_increment = np.dot(exog, self.param0) 

115 

116 self.orig_exog = exog 

117 self.exog_fulltrans = np.dot(exog, self.lhsf) 

118 

119 def offset_increment(self): 

120 """ 

121 Returns a vector that should be added to the offset vector to 

122 accommodate the constraint. 

123 

124 Parameters 

125 ---------- 

126 exog : array_like 

127 The exogeneous data for the model. 

128 """ 

129 

130 return self._offset_increment 

131 

132 def reduced_exog(self): 

133 """ 

134 Returns a linearly transformed exog matrix whose columns span 

135 the constrained model space. 

136 

137 Parameters 

138 ---------- 

139 exog : array_like 

140 The exogeneous data for the model. 

141 """ 

142 return self.exog_fulltrans[:, 0:self.lhs0.shape[1]] 

143 

144 def restore_exog(self): 

145 """ 

146 Returns the full exog matrix before it was reduced to 

147 satisfy the constraint. 

148 """ 

149 return self.orig_exog 

150 

151 def unpack_param(self, params): 

152 """ 

153 Converts the parameter vector `params` from reduced to full 

154 coordinates. 

155 """ 

156 

157 return self.param0 + np.dot(self.lhs0, params) 

158 

159 def unpack_cov(self, bcov): 

160 """ 

161 Converts the covariance matrix `bcov` from reduced to full 

162 coordinates. 

163 """ 

164 

165 return np.dot(self.lhs0, np.dot(bcov, self.lhs0.T)) 

166 

167 

168_gee_init_doc = """ 

169 Marginal regression model fit using Generalized Estimating Equations. 

170 

171 GEE can be used to fit Generalized Linear Models (GLMs) when the 

172 data have a grouped structure, and the observations are possibly 

173 correlated within groups but not between groups. 

174 

175 Parameters 

176 ---------- 

177 endog : array_like 

178 1d array of endogenous values (i.e. responses, outcomes, 

179 dependent variables, or 'Y' values). 

180 exog : array_like 

181 2d array of exogeneous values (i.e. covariates, predictors, 

182 independent variables, regressors, or 'X' values). A `nobs x 

183 k` array where `nobs` is the number of observations and `k` is 

184 the number of regressors. An intercept is not included by 

185 default and should be added by the user. See 

186 `statsmodels.tools.add_constant`. 

187 groups : array_like 

188 A 1d array of length `nobs` containing the group labels. 

189 time : array_like 

190 A 2d array of time (or other index) values, used by some 

191 dependence structures to define similarity relationships among 

192 observations within a cluster. 

193 family : family class instance 

194%(family_doc)s 

195 cov_struct : CovStruct class instance 

196 The default is Independence. To specify an exchangeable 

197 structure use cov_struct = Exchangeable(). See 

198 statsmodels.genmod.cov_struct.CovStruct for more 

199 information. 

200 offset : array_like 

201 An offset to be included in the fit. If provided, must be 

202 an array whose length is the number of rows in exog. 

203 dep_data : array_like 

204 Additional data passed to the dependence structure. 

205 constraint : (ndarray, ndarray) 

206 If provided, the constraint is a tuple (L, R) such that the 

207 model parameters are estimated under the constraint L * 

208 param = R, where L is a q x p matrix and R is a 

209 q-dimensional vector. If constraint is provided, a score 

210 test is performed to compare the constrained model to the 

211 unconstrained model. 

212 update_dep : bool 

213 If true, the dependence parameters are optimized, otherwise 

214 they are held fixed at their starting values. 

215 weights : array_like 

216 An array of weights to use in the analysis. The weights must 

217 be constant within each group. These correspond to 

218 probability weights (pweights) in Stata. 

219 %(extra_params)s 

220 

221 See Also 

222 -------- 

223 statsmodels.genmod.families.family 

224 :ref:`families` 

225 :ref:`links` 

226 

227 Notes 

228 ----- 

229 Only the following combinations make sense for family and link :: 

230 

231 + ident log logit probit cloglog pow opow nbinom loglog logc 

232 Gaussian | x x x 

233 inv Gaussian | x x x 

234 binomial | x x x x x x x x x 

235 Poisson | x x x 

236 neg binomial | x x x x 

237 gamma | x x x 

238 

239 Not all of these link functions are currently available. 

240 

241 Endog and exog are references so that if the data they refer 

242 to are already arrays and these arrays are changed, endog and 

243 exog will change. 

244 

245 The "robust" covariance type is the standard "sandwich estimator" 

246 (e.g. Liang and Zeger (1986)). It is the default here and in most 

247 other packages. The "naive" estimator gives smaller standard 

248 errors, but is only correct if the working correlation structure 

249 is correctly specified. The "bias reduced" estimator of Mancl and 

250 DeRouen (Biometrics, 2001) reduces the downward bias of the robust 

251 estimator. 

252 

253 The robust covariance provided here follows Liang and Zeger (1986) 

254 and agrees with R's gee implementation. To obtain the robust 

255 standard errors reported in Stata, multiply by sqrt(N / (N - g)), 

256 where N is the total sample size, and g is the average group size. 

257 

258 Examples 

259 -------- 

260 %(example)s 

261""" 

262 

263_gee_family_doc = """\ 

264 The default is Gaussian. To specify the binomial 

265 distribution use `family=sm.families.Binomial()`. Each family 

266 can take a link instance as an argument. See 

267 statsmodels.genmod.families.family for more information.""" 

268 

269_gee_ordinal_family_doc = """\ 

270 The only family supported is `Binomial`. The default `Logit` 

271 link may be replaced with `probit` if desired.""" 

272 

273_gee_nominal_family_doc = """\ 

274 The default value `None` uses a multinomial logit family 

275 specifically designed for use with GEE. Setting this 

276 argument to a non-default value is not currently supported.""" 

277 

278_gee_fit_doc = """ 

279 Fits a marginal regression model using generalized estimating 

280 equations (GEE). 

281 

282 Parameters 

283 ---------- 

284 maxiter : int 

285 The maximum number of iterations 

286 ctol : float 

287 The convergence criterion for stopping the Gauss-Seidel 

288 iterations 

289 start_params : array_like 

290 A vector of starting values for the regression 

291 coefficients. If None, a default is chosen. 

292 params_niter : int 

293 The number of Gauss-Seidel updates of the mean structure 

294 parameters that take place prior to each update of the 

295 dependence structure. 

296 first_dep_update : int 

297 No dependence structure updates occur before this 

298 iteration number. 

299 cov_type : str 

300 One of "robust", "naive", or "bias_reduced". 

301 ddof_scale : scalar or None 

302 The scale parameter is estimated as the sum of squared 

303 Pearson residuals divided by `N - ddof_scale`, where N 

304 is the total sample size. If `ddof_scale` is None, the 

305 number of covariates (including an intercept if present) 

306 is used. 

307 scaling_factor : scalar 

308 The estimated covariance of the parameter estimates is 

309 scaled by this value. Default is 1, Stata uses N / (N - g), 

310 where N is the total sample size and g is the average group 

311 size. 

312 scale : str or float, optional 

313 `scale` can be None, 'X2', or a float 

314 If a float, its value is used as the scale parameter. 

315 The default value is None, which uses `X2` (Pearson's 

316 chi-square) for Gamma, Gaussian, and Inverse Gaussian. 

317 The default is 1 for the Binomial and Poisson families. 

318 

319 Returns 

320 ------- 

321 An instance of the GEEResults class or subclass 

322 

323 Notes 

324 ----- 

325 If convergence difficulties occur, increase the values of 

326 `first_dep_update` and/or `params_niter`. Setting 

327 `first_dep_update` to a greater value (e.g. ~10-20) causes the 

328 algorithm to move close to the GLM solution before attempting 

329 to identify the dependence structure. 

330 

331 For the Gaussian family, there is no benefit to setting 

332 `params_niter` to a value greater than 1, since the mean 

333 structure parameters converge in one step. 

334""" 

335 

336_gee_results_doc = """ 

337 Attributes 

338 ---------- 

339 

340 cov_params_default : ndarray 

341 default covariance of the parameter estimates. Is chosen among one 

342 of the following three based on `cov_type` 

343 cov_robust : ndarray 

344 covariance of the parameter estimates that is robust 

345 cov_naive : ndarray 

346 covariance of the parameter estimates that is not robust to 

347 correlation or variance misspecification 

348 cov_robust_bc : ndarray 

349 covariance of the parameter estimates that is robust and bias 

350 reduced 

351 converged : bool 

352 indicator for convergence of the optimization. 

353 True if the norm of the score is smaller than a threshold 

354 cov_type : str 

355 string indicating whether a "robust", "naive" or "bias_reduced" 

356 covariance is used as default 

357 fit_history : dict 

358 Contains information about the iterations. 

359 fittedvalues : ndarray 

360 Linear predicted values for the fitted model. 

361 dot(exog, params) 

362 model : class instance 

363 Pointer to GEE model instance that called `fit`. 

364 normalized_cov_params : ndarray 

365 See GEE docstring 

366 params : ndarray 

367 The coefficients of the fitted model. Note that 

368 interpretation of the coefficients often depends on the 

369 distribution family and the data. 

370 scale : float 

371 The estimate of the scale / dispersion for the model fit. 

372 See GEE.fit for more information. 

373 score_norm : float 

374 norm of the score at the end of the iterative estimation. 

375 bse : ndarray 

376 The standard errors of the fitted GEE parameters. 

377""" 

378 

379_gee_example = """ 

380 Logistic regression with autoregressive working dependence: 

381 

382 >>> import statsmodels.api as sm 

383 >>> family = sm.families.Binomial() 

384 >>> va = sm.cov_struct.Autoregressive() 

385 >>> model = sm.GEE(endog, exog, group, family=family, cov_struct=va) 

386 >>> result = model.fit() 

387 >>> print(result.summary()) 

388 

389 Use formulas to fit a Poisson GLM with independent working 

390 dependence: 

391 

392 >>> import statsmodels.api as sm 

393 >>> fam = sm.families.Poisson() 

394 >>> ind = sm.cov_struct.Independence() 

395 >>> model = sm.GEE.from_formula("y ~ age + trt + base", "subject", \ 

396 data, cov_struct=ind, family=fam) 

397 >>> result = model.fit() 

398 >>> print(result.summary()) 

399 

400 Equivalent, using the formula API: 

401 

402 >>> import statsmodels.api as sm 

403 >>> import statsmodels.formula.api as smf 

404 >>> fam = sm.families.Poisson() 

405 >>> ind = sm.cov_struct.Independence() 

406 >>> model = smf.gee("y ~ age + trt + base", "subject", \ 

407 data, cov_struct=ind, family=fam) 

408 >>> result = model.fit() 

409 >>> print(result.summary()) 

410""" 

411 

412_gee_ordinal_example = """ 

413 Fit an ordinal regression model using GEE, with "global 

414 odds ratio" dependence: 

415 

416 >>> import statsmodels.api as sm 

417 >>> gor = sm.cov_struct.GlobalOddsRatio("ordinal") 

418 >>> model = sm.OrdinalGEE(endog, exog, groups, cov_struct=gor) 

419 >>> result = model.fit() 

420 >>> print(result.summary()) 

421 

422 Using formulas: 

423 

424 >>> import statsmodels.formula.api as smf 

425 >>> model = smf.ordinal_gee("y ~ x1 + x2", groups, data, 

426 cov_struct=gor) 

427 >>> result = model.fit() 

428 >>> print(result.summary()) 

429""" 

430 

431_gee_nominal_example = """ 

432 Fit a nominal regression model using GEE: 

433 

434 >>> import statsmodels.api as sm 

435 >>> import statsmodels.formula.api as smf 

436 >>> gor = sm.cov_struct.GlobalOddsRatio("nominal") 

437 >>> model = sm.NominalGEE(endog, exog, groups, cov_struct=gor) 

438 >>> result = model.fit() 

439 >>> print(result.summary()) 

440 

441 Using formulas: 

442 

443 >>> import statsmodels.api as sm 

444 >>> model = sm.NominalGEE.from_formula("y ~ x1 + x2", groups, 

445 data, cov_struct=gor) 

446 >>> result = model.fit() 

447 >>> print(result.summary()) 

448 

449 Using the formula API: 

450 

451 >>> import statsmodels.formula.api as smf 

452 >>> model = smf.nominal_gee("y ~ x1 + x2", groups, data, 

453 cov_struct=gor) 

454 >>> result = model.fit() 

455 >>> print(result.summary()) 

456""" 

457 

458 

459def _check_args(endog, exog, groups, time, offset, exposure): 

460 

461 if endog.size != exog.shape[0]: 

462 raise ValueError("Leading dimension of 'exog' should match " 

463 "length of 'endog'") 

464 

465 if groups.size != endog.size: 

466 raise ValueError("'groups' and 'endog' should have the same size") 

467 

468 if time is not None and (time.size != endog.size): 

469 raise ValueError("'time' and 'endog' should have the same size") 

470 

471 if offset is not None and (offset.size != endog.size): 

472 raise ValueError("'offset and 'endog' should have the same size") 

473 

474 if exposure is not None and (exposure.size != endog.size): 

475 raise ValueError("'exposure' and 'endog' should have the same size") 

476 

477 

478class GEE(base.Model): 

479 

480 __doc__ = ( 

481 " Marginal Regression Model using Generalized Estimating " 

482 "Equations.\n" + _gee_init_doc % 

483 {'extra_params': base._missing_param_doc, 

484 'family_doc': _gee_family_doc, 

485 'example': _gee_example}) 

486 

487 cached_means = None 

488 

489 def __init__(self, endog, exog, groups, time=None, family=None, 

490 cov_struct=None, missing='none', offset=None, 

491 exposure=None, dep_data=None, constraint=None, 

492 update_dep=True, weights=None, **kwargs): 

493 

494 if family is not None: 

495 if not isinstance(family.link, tuple(family.safe_links)): 

496 import warnings 

497 msg = ("The {0} link function does not respect the " 

498 "domain of the {1} family.") 

499 warnings.warn(msg.format(family.link.__class__.__name__, 

500 family.__class__.__name__), 

501 DomainWarning) 

502 

503 groups = np.asarray(groups) # in case groups is pandas 

504 

505 if "missing_idx" in kwargs and kwargs["missing_idx"] is not None: 

506 # If here, we are entering from super.from_formula; missing 

507 # has already been dropped from endog and exog, but not from 

508 # the other variables. 

509 ii = ~kwargs["missing_idx"] 

510 groups = groups[ii] 

511 if time is not None: 

512 time = time[ii] 

513 if offset is not None: 

514 offset = offset[ii] 

515 if exposure is not None: 

516 exposure = exposure[ii] 

517 del kwargs["missing_idx"] 

518 

519 _check_args(endog, exog, groups, time, offset, exposure) 

520 

521 self.missing = missing 

522 self.dep_data = dep_data 

523 self.constraint = constraint 

524 self.update_dep = update_dep 

525 

526 self._fit_history = defaultdict(list) 

527 

528 # Pass groups, time, offset, and dep_data so they are 

529 # processed for missing data along with endog and exog. 

530 # Calling super creates self.exog, self.endog, etc. as 

531 # ndarrays and the original exog, endog, etc. are 

532 # self.data.endog, etc. 

533 super(GEE, self).__init__(endog, exog, groups=groups, 

534 time=time, offset=offset, 

535 exposure=exposure, weights=weights, 

536 dep_data=dep_data, missing=missing, 

537 **kwargs) 

538 

539 self._init_keys.extend(["update_dep", "constraint", "family", 

540 "cov_struct"]) 

541 

542 # Handle the family argument 

543 if family is None: 

544 family = families.Gaussian() 

545 else: 

546 if not issubclass(family.__class__, families.Family): 

547 raise ValueError("GEE: `family` must be a genmod " 

548 "family instance") 

549 self.family = family 

550 

551 # Handle the cov_struct argument 

552 if cov_struct is None: 

553 cov_struct = cov_structs.Independence() 

554 else: 

555 if not issubclass(cov_struct.__class__, cov_structs.CovStruct): 

556 raise ValueError("GEE: `cov_struct` must be a genmod " 

557 "cov_struct instance") 

558 

559 self.cov_struct = cov_struct 

560 

561 # Handle the offset and exposure 

562 self._offset_exposure = None 

563 if offset is not None: 

564 self._offset_exposure = self.offset.copy() 

565 self.offset = offset 

566 if exposure is not None: 

567 if not isinstance(self.family.link, families.links.Log): 

568 raise ValueError( 

569 "exposure can only be used with the log link function") 

570 if self._offset_exposure is not None: 

571 self._offset_exposure += np.log(exposure) 

572 else: 

573 self._offset_exposure = np.log(exposure) 

574 self.exposure = exposure 

575 

576 # Handle the constraint 

577 self.constraint = None 

578 if constraint is not None: 

579 if len(constraint) != 2: 

580 raise ValueError("GEE: `constraint` must be a 2-tuple.") 

581 if constraint[0].shape[1] != self.exog.shape[1]: 

582 raise ValueError( 

583 "GEE: the left hand side of the constraint must have " 

584 "the same number of columns as the exog matrix.") 

585 self.constraint = ParameterConstraint(constraint[0], 

586 constraint[1], 

587 self.exog) 

588 

589 if self._offset_exposure is not None: 

590 self._offset_exposure += self.constraint.offset_increment() 

591 else: 

592 self._offset_exposure = ( 

593 self.constraint.offset_increment().copy()) 

594 self.exog = self.constraint.reduced_exog() 

595 

596 # Create list of row indices for each group 

597 group_labels, ix = np.unique(self.groups, return_inverse=True) 

598 se = pd.Series(index=np.arange(len(ix)), dtype="int") 

599 gb = se.groupby(ix).groups 

600 dk = [(lb, np.asarray(gb[k])) for k, lb in enumerate(group_labels)] 

601 self.group_indices = dict(dk) 

602 self.group_labels = group_labels 

603 

604 # Convert the data to the internal representation, which is a 

605 # list of arrays, corresponding to the groups. 

606 self.endog_li = self.cluster_list(self.endog) 

607 self.exog_li = self.cluster_list(self.exog) 

608 

609 if self.weights is not None: 

610 self.weights_li = self.cluster_list(self.weights) 

611 self.weights_li = [x[0] for x in self.weights_li] 

612 self.weights_li = np.asarray(self.weights_li) 

613 

614 self.num_group = len(self.endog_li) 

615 

616 # Time defaults to a 1d grid with equal spacing 

617 if self.time is not None: 

618 self.time = np.asarray(self.time, np.float64) 

619 if self.time.ndim == 1: 

620 self.time = self.time[:, None] 

621 self.time_li = self.cluster_list(self.time) 

622 else: 

623 self.time_li = \ 

624 [np.arange(len(y), dtype=np.float64)[:, None] 

625 for y in self.endog_li] 

626 self.time = np.concatenate(self.time_li) 

627 

628 if self._offset_exposure is not None: 

629 self.offset_li = self.cluster_list(self._offset_exposure) 

630 else: 

631 self.offset_li = None 

632 if constraint is not None: 

633 self.constraint.exog_fulltrans_li = \ 

634 self.cluster_list(self.constraint.exog_fulltrans) 

635 

636 self.family = family 

637 

638 self.cov_struct.initialize(self) 

639 

640 # Total sample size 

641 group_ns = [len(y) for y in self.endog_li] 

642 self.nobs = sum(group_ns) 

643 # The following are column based, not on rank see #1928 

644 self.df_model = self.exog.shape[1] - 1 # assumes constant 

645 self.df_resid = self.nobs - self.exog.shape[1] 

646 

647 # Skip the covariance updates if all groups have a single 

648 # observation (reduces to fitting a GLM). 

649 maxgroup = max([len(x) for x in self.endog_li]) 

650 if maxgroup == 1: 

651 self.update_dep = False 

652 

653 # Override to allow groups and time to be passed as variable 

654 # names. 

655 @classmethod 

656 def from_formula(cls, formula, groups, data, subset=None, 

657 time=None, offset=None, exposure=None, 

658 *args, **kwargs): 

659 """ 

660 Create a GEE model instance from a formula and dataframe. 

661 

662 Parameters 

663 ---------- 

664 formula : str or generic Formula object 

665 The formula specifying the model 

666 groups : array_like or string 

667 Array of grouping labels. If a string, this is the name 

668 of a variable in `data` that contains the grouping labels. 

669 data : array_like 

670 The data for the model. 

671 subset : array_like 

672 An array-like object of booleans, integers, or index 

673 values that indicate the subset of the data to used when 

674 fitting the model. 

675 time : array_like or string 

676 The time values, used for dependence structures involving 

677 distances between observations. If a string, this is the 

678 name of a variable in `data` that contains the time 

679 values. 

680 offset : array_like or string 

681 The offset values, added to the linear predictor. If a 

682 string, this is the name of a variable in `data` that 

683 contains the offset values. 

684 exposure : array_like or string 

685 The exposure values, only used if the link function is the 

686 logarithm function, in which case the log of `exposure` 

687 is added to the offset (if any). If a string, this is the 

688 name of a variable in `data` that contains the offset 

689 values. 

690 %(missing_param_doc)s 

691 args : extra arguments 

692 These are passed to the model 

693 kwargs : extra keyword arguments 

694 These are passed to the model with two exceptions. `dep_data` 

695 is processed as described below. The ``eval_env`` keyword is 

696 passed to patsy. It can be either a 

697 :class:`patsy:patsy.EvalEnvironment` object or an integer 

698 indicating the depth of the namespace to use. For example, the 

699 default ``eval_env=0`` uses the calling namespace. 

700 If you wish to use a "clean" environment set ``eval_env=-1``. 

701 

702 Optional arguments 

703 ------------------ 

704 dep_data : str or array_like 

705 Data used for estimating the dependence structure. See 

706 specific dependence structure classes (e.g. Nested) for 

707 details. If `dep_data` is a string, it is interpreted as 

708 a formula that is applied to `data`. If it is an array, it 

709 must be an array of strings corresponding to column names in 

710 `data`. Otherwise it must be an array-like with the same 

711 number of rows as data. 

712 

713 Returns 

714 ------- 

715 model : GEE model instance 

716 

717 Notes 

718 ----- 

719 `data` must define __getitem__ with the keys in the formula 

720 terms args and kwargs are passed on to the model 

721 instantiation. E.g., a numpy structured or rec array, a 

722 dictionary, or a pandas DataFrame. 

723 """ % {'missing_param_doc': base._missing_param_doc} 

724 

725 groups_name = "Groups" 

726 if isinstance(groups, str): 

727 groups_name = groups 

728 groups = data[groups] 

729 

730 if isinstance(time, str): 

731 time = data[time] 

732 

733 if isinstance(offset, str): 

734 offset = data[offset] 

735 

736 if isinstance(exposure, str): 

737 exposure = data[exposure] 

738 

739 dep_data = kwargs.get("dep_data") 

740 dep_data_names = None 

741 if dep_data is not None: 

742 if isinstance(dep_data, str): 

743 dep_data = patsy.dmatrix(dep_data, data, 

744 return_type='dataframe') 

745 dep_data_names = dep_data.columns.tolist() 

746 else: 

747 dep_data_names = list(dep_data) 

748 dep_data = data[dep_data] 

749 kwargs["dep_data"] = np.asarray(dep_data) 

750 

751 model = super(GEE, cls).from_formula(formula, data=data, subset=subset, 

752 groups=groups, time=time, 

753 offset=offset, 

754 exposure=exposure, 

755 *args, **kwargs) 

756 

757 if dep_data_names is not None: 

758 model._dep_data_names = dep_data_names 

759 model._groups_name = groups_name 

760 

761 return model 

762 

763 def cluster_list(self, array): 

764 """ 

765 Returns `array` split into subarrays corresponding to the 

766 cluster structure. 

767 """ 

768 

769 if array.ndim == 1: 

770 return [np.array(array[self.group_indices[k]]) 

771 for k in self.group_labels] 

772 else: 

773 return [np.array(array[self.group_indices[k], :]) 

774 for k in self.group_labels] 

775 

776 def compare_score_test(self, submodel): 

777 """ 

778 Perform a score test for the given submodel against this model. 

779 

780 Parameters 

781 ---------- 

782 submodel : GEEResults instance 

783 A fitted GEE model that is a submodel of this model. 

784 

785 Returns 

786 ------- 

787 A dictionary with keys "statistic", "p-value", and "df", 

788 containing the score test statistic, its chi^2 p-value, 

789 and the degrees of freedom used to compute the p-value. 

790 

791 Notes 

792 ----- 

793 The score test can be performed without calling 'fit' on the 

794 larger model. The provided submodel must be obtained from a 

795 fitted GEE. 

796 

797 This method performs the same score test as can be obtained by 

798 fitting the GEE with a linear constraint and calling `score_test` 

799 on the results. 

800 

801 References 

802 ---------- 

803 Xu Guo and Wei Pan (2002). "Small sample performance of the score 

804 test in GEE". 

805 http://www.sph.umn.edu/faculty1/wp-content/uploads/2012/11/rr2002-013.pdf 

806 """ 

807 

808 # Since the model has not been fit, its scaletype has not been 

809 # set. So give it the scaletype of the submodel. 

810 self.scaletype = submodel.model.scaletype 

811 

812 # Check consistency between model and submodel (not a comprehensive 

813 # check) 

814 submod = submodel.model 

815 if self.exog.shape[0] != submod.exog.shape[0]: 

816 msg = "Model and submodel have different numbers of cases." 

817 raise ValueError(msg) 

818 if self.exog.shape[1] == submod.exog.shape[1]: 

819 msg = "Model and submodel have the same number of variables" 

820 warnings.warn(msg) 

821 if not isinstance(self.family, type(submod.family)): 

822 msg = "Model and submodel have different GLM families." 

823 warnings.warn(msg) 

824 if not isinstance(self.cov_struct, type(submod.cov_struct)): 

825 warnings.warn("Model and submodel have different GEE covariance " 

826 "structures.") 

827 if not np.equal(self.weights, submod.weights).all(): 

828 msg = "Model and submodel should have the same weights." 

829 warnings.warn(msg) 

830 

831 # Get the positions of the submodel variables in the 

832 # parent model 

833 qm, qc = _score_test_submodel(self, submodel.model) 

834 if qm is None: 

835 msg = "The provided model is not a submodel." 

836 raise ValueError(msg) 

837 

838 # Embed the submodel params into a params vector for the 

839 # parent model 

840 params_ex = np.dot(qm, submodel.params) 

841 

842 # Attempt to preserve the state of the parent model 

843 cov_struct_save = self.cov_struct 

844 import copy 

845 cached_means_save = copy.deepcopy(self.cached_means) 

846 

847 # Get the score vector of the submodel params in 

848 # the parent model 

849 self.cov_struct = submodel.cov_struct 

850 self.update_cached_means(params_ex) 

851 _, score = self._update_mean_params() 

852 if score is None: 

853 msg = "Singular matrix encountered in GEE score test" 

854 warnings.warn(msg, ConvergenceWarning) 

855 return None 

856 

857 if not hasattr(self, "ddof_scale"): 

858 self.ddof_scale = self.exog.shape[1] 

859 

860 if not hasattr(self, "scaling_factor"): 

861 self.scaling_factor = 1 

862 

863 _, ncov1, cmat = self._covmat() 

864 scale = self.estimate_scale() 

865 cmat = cmat / scale ** 2 

866 score2 = np.dot(qc.T, score) / scale 

867 

868 amat = np.linalg.inv(ncov1) 

869 

870 bmat_11 = np.dot(qm.T, np.dot(cmat, qm)) 

871 bmat_22 = np.dot(qc.T, np.dot(cmat, qc)) 

872 bmat_12 = np.dot(qm.T, np.dot(cmat, qc)) 

873 

874 amat_11 = np.dot(qm.T, np.dot(amat, qm)) 

875 amat_12 = np.dot(qm.T, np.dot(amat, qc)) 

876 

877 score_cov = bmat_22 - np.dot(amat_12.T, 

878 np.linalg.solve(amat_11, bmat_12)) 

879 score_cov -= np.dot(bmat_12.T, 

880 np.linalg.solve(amat_11, amat_12)) 

881 score_cov += np.dot(amat_12.T, 

882 np.dot(np.linalg.solve(amat_11, bmat_11), 

883 np.linalg.solve(amat_11, amat_12))) 

884 

885 # Attempt to restore state 

886 self.cov_struct = cov_struct_save 

887 self.cached_means = cached_means_save 

888 

889 from scipy.stats.distributions import chi2 

890 score_statistic = np.dot(score2, 

891 np.linalg.solve(score_cov, score2)) 

892 score_df = len(score2) 

893 score_pvalue = 1 - chi2.cdf(score_statistic, score_df) 

894 return {"statistic": score_statistic, 

895 "df": score_df, 

896 "p-value": score_pvalue} 

897 

898 def estimate_scale(self): 

899 """ 

900 Estimate the dispersion/scale. 

901 """ 

902 

903 if self.scaletype is None: 

904 if isinstance(self.family, (families.Binomial, families.Poisson, 

905 families.NegativeBinomial, 

906 _Multinomial)): 

907 return 1. 

908 elif isinstance(self.scaletype, float): 

909 return np.array(self.scaletype) 

910 

911 endog = self.endog_li 

912 cached_means = self.cached_means 

913 nobs = self.nobs 

914 varfunc = self.family.variance 

915 

916 scale = 0. 

917 fsum = 0. 

918 for i in range(self.num_group): 

919 

920 if len(endog[i]) == 0: 

921 continue 

922 

923 expval, _ = cached_means[i] 

924 

925 f = self.weights_li[i] if self.weights is not None else 1. 

926 

927 sdev = np.sqrt(varfunc(expval)) 

928 resid = (endog[i] - expval) / sdev 

929 

930 scale += f * np.sum(resid ** 2) 

931 fsum += f * len(endog[i]) 

932 

933 scale /= (fsum * (nobs - self.ddof_scale) / float(nobs)) 

934 

935 return scale 

936 

937 def mean_deriv(self, exog, lin_pred): 

938 """ 

939 Derivative of the expected endog with respect to the parameters. 

940 

941 Parameters 

942 ---------- 

943 exog : array_like 

944 The exogeneous data at which the derivative is computed. 

945 lin_pred : array_like 

946 The values of the linear predictor. 

947 

948 Returns 

949 ------- 

950 The value of the derivative of the expected endog with respect 

951 to the parameter vector. 

952 

953 Notes 

954 ----- 

955 If there is an offset or exposure, it should be added to 

956 `lin_pred` prior to calling this function. 

957 """ 

958 

959 idl = self.family.link.inverse_deriv(lin_pred) 

960 dmat = exog * idl[:, None] 

961 return dmat 

962 

963 def mean_deriv_exog(self, exog, params, offset_exposure=None): 

964 """ 

965 Derivative of the expected endog with respect to exog. 

966 

967 Parameters 

968 ---------- 

969 exog : array_like 

970 Values of the independent variables at which the derivative 

971 is calculated. 

972 params : array_like 

973 Parameter values at which the derivative is calculated. 

974 offset_exposure : array_like, optional 

975 Combined offset and exposure. 

976 

977 Returns 

978 ------- 

979 The derivative of the expected endog with respect to exog. 

980 """ 

981 

982 lin_pred = np.dot(exog, params) 

983 if offset_exposure is not None: 

984 lin_pred += offset_exposure 

985 

986 idl = self.family.link.inverse_deriv(lin_pred) 

987 dmat = np.outer(idl, params) 

988 return dmat 

989 

990 def _update_mean_params(self): 

991 """ 

992 Returns 

993 ------- 

994 update : array_like 

995 The update vector such that params + update is the next 

996 iterate when solving the score equations. 

997 score : array_like 

998 The current value of the score equations, not 

999 incorporating the scale parameter. If desired, 

1000 multiply this vector by the scale parameter to 

1001 incorporate the scale. 

1002 """ 

1003 

1004 endog = self.endog_li 

1005 exog = self.exog_li 

1006 

1007 cached_means = self.cached_means 

1008 

1009 varfunc = self.family.variance 

1010 

1011 bmat, score = 0, 0 

1012 for i in range(self.num_group): 

1013 

1014 expval, lpr = cached_means[i] 

1015 resid = endog[i] - expval 

1016 dmat = self.mean_deriv(exog[i], lpr) 

1017 sdev = np.sqrt(varfunc(expval)) 

1018 

1019 rslt = self.cov_struct.covariance_matrix_solve(expval, i, 

1020 sdev, (dmat, resid)) 

1021 if rslt is None: 

1022 return None, None 

1023 vinv_d, vinv_resid = tuple(rslt) 

1024 

1025 f = self.weights_li[i] if self.weights is not None else 1. 

1026 

1027 bmat += f * np.dot(dmat.T, vinv_d) 

1028 score += f * np.dot(dmat.T, vinv_resid) 

1029 

1030 update = np.linalg.solve(bmat, score) 

1031 

1032 self._fit_history["cov_adjust"].append( 

1033 self.cov_struct.cov_adjust) 

1034 

1035 return update, score 

1036 

1037 def update_cached_means(self, mean_params): 

1038 """ 

1039 cached_means should always contain the most recent calculation 

1040 of the group-wise mean vectors. This function should be 

1041 called every time the regression parameters are changed, to 

1042 keep the cached means up to date. 

1043 """ 

1044 

1045 endog = self.endog_li 

1046 exog = self.exog_li 

1047 offset = self.offset_li 

1048 

1049 linkinv = self.family.link.inverse 

1050 

1051 self.cached_means = [] 

1052 

1053 for i in range(self.num_group): 

1054 

1055 if len(endog[i]) == 0: 

1056 continue 

1057 

1058 lpr = np.dot(exog[i], mean_params) 

1059 if offset is not None: 

1060 lpr += offset[i] 

1061 expval = linkinv(lpr) 

1062 

1063 self.cached_means.append((expval, lpr)) 

1064 

1065 def _covmat(self): 

1066 """ 

1067 Returns the sampling covariance matrix of the regression 

1068 parameters and related quantities. 

1069 

1070 Returns 

1071 ------- 

1072 cov_robust : array_like 

1073 The robust, or sandwich estimate of the covariance, which 

1074 is meaningful even if the working covariance structure is 

1075 incorrectly specified. 

1076 cov_naive : array_like 

1077 The model-based estimate of the covariance, which is 

1078 meaningful if the covariance structure is correctly 

1079 specified. 

1080 cmat : array_like 

1081 The center matrix of the sandwich expression, used in 

1082 obtaining score test results. 

1083 """ 

1084 

1085 endog = self.endog_li 

1086 exog = self.exog_li 

1087 varfunc = self.family.variance 

1088 cached_means = self.cached_means 

1089 

1090 # Calculate the naive (model-based) and robust (sandwich) 

1091 # covariances. 

1092 bmat, cmat = 0, 0 

1093 for i in range(self.num_group): 

1094 

1095 expval, lpr = cached_means[i] 

1096 resid = endog[i] - expval 

1097 dmat = self.mean_deriv(exog[i], lpr) 

1098 sdev = np.sqrt(varfunc(expval)) 

1099 

1100 rslt = self.cov_struct.covariance_matrix_solve( 

1101 expval, i, sdev, (dmat, resid)) 

1102 if rslt is None: 

1103 return None, None, None, None 

1104 vinv_d, vinv_resid = tuple(rslt) 

1105 

1106 f = self.weights_li[i] if self.weights is not None else 1. 

1107 

1108 bmat += f * np.dot(dmat.T, vinv_d) 

1109 dvinv_resid = f * np.dot(dmat.T, vinv_resid) 

1110 cmat += np.outer(dvinv_resid, dvinv_resid) 

1111 

1112 scale = self.estimate_scale() 

1113 

1114 bmati = np.linalg.inv(bmat) 

1115 cov_naive = bmati * scale 

1116 cov_robust = np.dot(bmati, np.dot(cmat, bmati)) 

1117 

1118 cov_naive *= self.scaling_factor 

1119 cov_robust *= self.scaling_factor 

1120 return cov_robust, cov_naive, cmat 

1121 

1122 # Calculate the bias-corrected sandwich estimate of Mancl and 

1123 # DeRouen. 

1124 def _bc_covmat(self, cov_naive): 

1125 

1126 cov_naive = cov_naive / self.scaling_factor 

1127 endog = self.endog_li 

1128 exog = self.exog_li 

1129 varfunc = self.family.variance 

1130 cached_means = self.cached_means 

1131 scale = self.estimate_scale() 

1132 

1133 bcm = 0 

1134 for i in range(self.num_group): 

1135 

1136 expval, lpr = cached_means[i] 

1137 resid = endog[i] - expval 

1138 dmat = self.mean_deriv(exog[i], lpr) 

1139 sdev = np.sqrt(varfunc(expval)) 

1140 

1141 rslt = self.cov_struct.covariance_matrix_solve( 

1142 expval, i, sdev, (dmat,)) 

1143 if rslt is None: 

1144 return None 

1145 vinv_d = rslt[0] 

1146 vinv_d /= scale 

1147 

1148 hmat = np.dot(vinv_d, cov_naive) 

1149 hmat = np.dot(hmat, dmat.T).T 

1150 

1151 f = self.weights_li[i] if self.weights is not None else 1. 

1152 

1153 aresid = np.linalg.solve(np.eye(len(resid)) - hmat, resid) 

1154 rslt = self.cov_struct.covariance_matrix_solve( 

1155 expval, i, sdev, (aresid,)) 

1156 if rslt is None: 

1157 return None 

1158 srt = rslt[0] 

1159 srt = f * np.dot(dmat.T, srt) / scale 

1160 bcm += np.outer(srt, srt) 

1161 

1162 cov_robust_bc = np.dot(cov_naive, np.dot(bcm, cov_naive)) 

1163 cov_robust_bc *= self.scaling_factor 

1164 

1165 return cov_robust_bc 

1166 

1167 def predict(self, params, exog=None, offset=None, 

1168 exposure=None, linear=False): 

1169 """ 

1170 Return predicted values for a marginal regression model fit 

1171 using GEE. 

1172 

1173 Parameters 

1174 ---------- 

1175 params : array_like 

1176 Parameters / coefficients of a marginal regression model. 

1177 exog : array_like, optional 

1178 Design / exogenous data. If exog is None, model exog is 

1179 used. 

1180 offset : array_like, optional 

1181 Offset for exog if provided. If offset is None, model 

1182 offset is used. 

1183 exposure : array_like, optional 

1184 Exposure for exog, if exposure is None, model exposure is 

1185 used. Only allowed if link function is the logarithm. 

1186 linear : bool 

1187 If True, returns the linear predicted values. If False, 

1188 returns the value of the inverse of the model's link 

1189 function at the linear predicted values. 

1190 

1191 Returns 

1192 ------- 

1193 An array of fitted values 

1194 

1195 Notes 

1196 ----- 

1197 Using log(V) as the offset is equivalent to using V as the 

1198 exposure. If exposure U and offset V are both provided, then 

1199 log(U) + V is added to the linear predictor. 

1200 """ 

1201 

1202 # TODO: many paths through this, not well covered in tests 

1203 

1204 if exposure is not None: 

1205 if not isinstance(self.family.link, families.links.Log): 

1206 raise ValueError( 

1207 "exposure can only be used with the log link function") 

1208 

1209 # This is the combined offset and exposure 

1210 _offset = 0. 

1211 

1212 # Using model exog 

1213 if exog is None: 

1214 exog = self.exog 

1215 

1216 if not isinstance(self.family.link, families.links.Log): 

1217 # Do not need to worry about exposure 

1218 if offset is None: 

1219 if self._offset_exposure is not None: 

1220 _offset = self._offset_exposure.copy() 

1221 else: 

1222 _offset = offset 

1223 

1224 else: 

1225 if offset is None and exposure is None: 

1226 if self._offset_exposure is not None: 

1227 _offset = self._offset_exposure 

1228 elif offset is None and exposure is not None: 

1229 _offset = np.log(exposure) 

1230 if hasattr(self, "offset"): 

1231 _offset = _offset + self.offset 

1232 elif offset is not None and exposure is None: 

1233 _offset = offset 

1234 if hasattr(self, "exposure"): 

1235 _offset = offset + np.log(self.exposure) 

1236 else: 

1237 _offset = offset + np.log(exposure) 

1238 

1239 # exog is provided: this is simpler than above because we 

1240 # never use model exog or exposure if exog is provided. 

1241 else: 

1242 if offset is not None: 

1243 _offset = _offset + offset 

1244 if exposure is not None: 

1245 _offset += np.log(exposure) 

1246 

1247 lin_pred = _offset + np.dot(exog, params) 

1248 

1249 if not linear: 

1250 return self.family.link.inverse(lin_pred) 

1251 

1252 return lin_pred 

1253 

1254 def _starting_params(self): 

1255 

1256 model = GLM(self.endog, self.exog, family=self.family, 

1257 offset=self._offset_exposure, 

1258 freq_weights=self.weights) 

1259 result = model.fit() 

1260 return result.params 

1261 

1262 @Appender(_gee_fit_doc) 

1263 def fit(self, maxiter=60, ctol=1e-6, start_params=None, 

1264 params_niter=1, first_dep_update=0, 

1265 cov_type='robust', ddof_scale=None, scaling_factor=1., 

1266 scale=None): 

1267 

1268 self.scaletype = scale 

1269 

1270 # Subtract this number from the total sample size when 

1271 # normalizing the scale parameter estimate. 

1272 if ddof_scale is None: 

1273 self.ddof_scale = self.exog.shape[1] 

1274 else: 

1275 if not ddof_scale >= 0: 

1276 raise ValueError( 

1277 "ddof_scale must be a non-negative number or None") 

1278 self.ddof_scale = ddof_scale 

1279 

1280 self.scaling_factor = scaling_factor 

1281 

1282 self._fit_history = defaultdict(list) 

1283 

1284 if self.weights is not None and cov_type == 'naive': 

1285 raise ValueError("when using weights, cov_type may not be naive") 

1286 

1287 if start_params is None: 

1288 mean_params = self._starting_params() 

1289 else: 

1290 start_params = np.asarray(start_params) 

1291 mean_params = start_params.copy() 

1292 

1293 self.update_cached_means(mean_params) 

1294 

1295 del_params = -1. 

1296 num_assoc_updates = 0 

1297 for itr in range(maxiter): 

1298 

1299 update, score = self._update_mean_params() 

1300 if update is None: 

1301 warnings.warn("Singular matrix encountered in GEE update", 

1302 ConvergenceWarning) 

1303 break 

1304 mean_params += update 

1305 self.update_cached_means(mean_params) 

1306 

1307 # L2 norm of the change in mean structure parameters at 

1308 # this iteration. 

1309 del_params = np.sqrt(np.sum(score ** 2)) 

1310 

1311 self._fit_history['params'].append(mean_params.copy()) 

1312 self._fit_history['score'].append(score) 

1313 self._fit_history['dep_params'].append( 

1314 self.cov_struct.dep_params) 

1315 

1316 # Do not exit until the association parameters have been 

1317 # updated at least once. 

1318 if (del_params < ctol and 

1319 (num_assoc_updates > 0 or self.update_dep is False)): 

1320 break 

1321 

1322 # Update the dependence structure 

1323 if (self.update_dep and (itr % params_niter) == 0 

1324 and (itr >= first_dep_update)): 

1325 self._update_assoc(mean_params) 

1326 num_assoc_updates += 1 

1327 

1328 if del_params >= ctol: 

1329 warnings.warn("Iteration limit reached prior to convergence", 

1330 IterationLimitWarning) 

1331 

1332 if mean_params is None: 

1333 warnings.warn("Unable to estimate GEE parameters.", 

1334 ConvergenceWarning) 

1335 return None 

1336 

1337 bcov, ncov, _ = self._covmat() 

1338 if bcov is None: 

1339 warnings.warn("Estimated covariance structure for GEE " 

1340 "estimates is singular", ConvergenceWarning) 

1341 return None 

1342 bc_cov = None 

1343 if cov_type == "bias_reduced": 

1344 bc_cov = self._bc_covmat(ncov) 

1345 

1346 if self.constraint is not None: 

1347 x = mean_params.copy() 

1348 mean_params, bcov = self._handle_constraint(mean_params, bcov) 

1349 if mean_params is None: 

1350 warnings.warn("Unable to estimate constrained GEE " 

1351 "parameters.", ConvergenceWarning) 

1352 return None 

1353 

1354 y, ncov = self._handle_constraint(x, ncov) 

1355 if y is None: 

1356 warnings.warn("Unable to estimate constrained GEE " 

1357 "parameters.", ConvergenceWarning) 

1358 return None 

1359 

1360 if bc_cov is not None: 

1361 y, bc_cov = self._handle_constraint(x, bc_cov) 

1362 if x is None: 

1363 warnings.warn("Unable to estimate constrained GEE " 

1364 "parameters.", ConvergenceWarning) 

1365 return None 

1366 

1367 scale = self.estimate_scale() 

1368 

1369 # kwargs to add to results instance, need to be available in __init__ 

1370 res_kwds = dict(cov_type=cov_type, 

1371 cov_robust=bcov, 

1372 cov_naive=ncov, 

1373 cov_robust_bc=bc_cov) 

1374 

1375 # The superclass constructor will multiply the covariance 

1376 # matrix argument bcov by scale, which we do not want, so we 

1377 # divide bcov by the scale parameter here 

1378 results = GEEResults(self, mean_params, bcov / scale, scale, 

1379 cov_type=cov_type, use_t=False, 

1380 attr_kwds=res_kwds) 

1381 

1382 # attributes not needed during results__init__ 

1383 results.fit_history = self._fit_history 

1384 self.fit_history = defaultdict(list) 

1385 results.score_norm = del_params 

1386 results.converged = (del_params < ctol) 

1387 results.cov_struct = self.cov_struct 

1388 results.params_niter = params_niter 

1389 results.first_dep_update = first_dep_update 

1390 results.ctol = ctol 

1391 results.maxiter = maxiter 

1392 

1393 # These will be copied over to subclasses when upgrading. 

1394 results._props = ["cov_type", "use_t", 

1395 "cov_params_default", "cov_robust", 

1396 "cov_naive", "cov_robust_bc", 

1397 "fit_history", 

1398 "score_norm", "converged", "cov_struct", 

1399 "params_niter", "first_dep_update", "ctol", 

1400 "maxiter"] 

1401 

1402 return GEEResultsWrapper(results) 

1403 

1404 def _update_regularized(self, params, pen_wt, scad_param, eps): 

1405 

1406 sn, hm = 0, 0 

1407 

1408 for i in range(self.num_group): 

1409 

1410 expval, _ = self.cached_means[i] 

1411 resid = self.endog_li[i] - expval 

1412 sdev = np.sqrt(self.family.variance(expval)) 

1413 

1414 ex = self.exog_li[i] * sdev[:, None]**2 

1415 rslt = self.cov_struct.covariance_matrix_solve( 

1416 expval, i, sdev, (resid, ex)) 

1417 sn0 = rslt[0] 

1418 sn += np.dot(ex.T, sn0) 

1419 hm0 = rslt[1] 

1420 hm += np.dot(ex.T, hm0) 

1421 

1422 # Wang et al. divide sn here by num_group, but that 

1423 # seems to be incorrect 

1424 

1425 ap = np.abs(params) 

1426 clipped = np.clip(scad_param * pen_wt - ap, 0, np.inf) 

1427 en = pen_wt * clipped * (ap > pen_wt) 

1428 en /= (scad_param - 1) * pen_wt 

1429 en += pen_wt * (ap <= pen_wt) 

1430 en /= eps + ap 

1431 

1432 hm.flat[::hm.shape[0] + 1] += self.num_group * en 

1433 sn -= self.num_group * en * params 

1434 update = np.linalg.solve(hm, sn) 

1435 hm *= self.estimate_scale() 

1436 

1437 return update, hm 

1438 

1439 def _regularized_covmat(self, mean_params): 

1440 

1441 self.update_cached_means(mean_params) 

1442 

1443 ma = 0 

1444 

1445 for i in range(self.num_group): 

1446 

1447 expval, _ = self.cached_means[i] 

1448 resid = self.endog_li[i] - expval 

1449 sdev = np.sqrt(self.family.variance(expval)) 

1450 

1451 ex = self.exog_li[i] * sdev[:, None]**2 

1452 rslt = self.cov_struct.covariance_matrix_solve( 

1453 expval, i, sdev, (resid,)) 

1454 ma0 = np.dot(ex.T, rslt[0]) 

1455 ma += np.outer(ma0, ma0) 

1456 

1457 return ma 

1458 

1459 def fit_regularized(self, pen_wt, scad_param=3.7, maxiter=100, 

1460 ddof_scale=None, update_assoc=5, 

1461 ctol=1e-5, ztol=1e-3, eps=1e-6, scale=None): 

1462 """ 

1463 Regularized estimation for GEE. 

1464 

1465 Parameters 

1466 ---------- 

1467 pen_wt : float 

1468 The penalty weight (a non-negative scalar). 

1469 scad_param : float 

1470 Non-negative scalar determining the shape of the Scad 

1471 penalty. 

1472 maxiter : int 

1473 The maximum number of iterations. 

1474 ddof_scale : int 

1475 Value to subtract from `nobs` when calculating the 

1476 denominator degrees of freedom for t-statistics, defaults 

1477 to the number of columns in `exog`. 

1478 update_assoc : int 

1479 The dependence parameters are updated every `update_assoc` 

1480 iterations of the mean structure parameter updates. 

1481 ctol : float 

1482 Convergence criterion, default is one order of magnitude 

1483 smaller than proposed in section 3.1 of Wang et al. 

1484 ztol : float 

1485 Coefficients smaller than this value are treated as 

1486 being zero, default is based on section 5 of Wang et al. 

1487 eps : non-negative scalar 

1488 Numerical constant, see section 3.2 of Wang et al. 

1489 scale : float or string 

1490 If a float, this value is used as the scale parameter. 

1491 If "X2", the scale parameter is always estimated using 

1492 Pearson's chi-square method (e.g. as in a quasi-Poisson 

1493 analysis). If None, the default approach for the family 

1494 is used to estimate the scale parameter. 

1495 

1496 Returns 

1497 ------- 

1498 GEEResults instance. Note that not all methods of the results 

1499 class make sense when the model has been fit with regularization. 

1500 

1501 Notes 

1502 ----- 

1503 This implementation assumes that the link is canonical. 

1504 

1505 References 

1506 ---------- 

1507 Wang L, Zhou J, Qu A. (2012). Penalized generalized estimating 

1508 equations for high-dimensional longitudinal data analysis. 

1509 Biometrics. 2012 Jun;68(2):353-60. 

1510 doi: 10.1111/j.1541-0420.2011.01678.x. 

1511 https://www.ncbi.nlm.nih.gov/pubmed/21955051 

1512 http://users.stat.umn.edu/~wangx346/research/GEE_selection.pdf 

1513 """ 

1514 

1515 self.scaletype = scale 

1516 

1517 mean_params = np.zeros(self.exog.shape[1]) 

1518 self.update_cached_means(mean_params) 

1519 converged = False 

1520 fit_history = defaultdict(list) 

1521 

1522 # Subtract this number from the total sample size when 

1523 # normalizing the scale parameter estimate. 

1524 if ddof_scale is None: 

1525 self.ddof_scale = self.exog.shape[1] 

1526 else: 

1527 if not ddof_scale >= 0: 

1528 raise ValueError( 

1529 "ddof_scale must be a non-negative number or None") 

1530 self.ddof_scale = ddof_scale 

1531 

1532 # Keep this private for now. In some cases the early steps are 

1533 # very small so it seems necessary to ensure a certain minimum 

1534 # number of iterations before testing for convergence. 

1535 miniter = 20 

1536 

1537 for itr in range(maxiter): 

1538 

1539 update, hm = self._update_regularized( 

1540 mean_params, pen_wt, scad_param, eps) 

1541 if update is None: 

1542 msg = "Singular matrix encountered in regularized GEE update", 

1543 warnings.warn(msg, ConvergenceWarning) 

1544 break 

1545 if itr > miniter and np.sqrt(np.sum(update**2)) < ctol: 

1546 converged = True 

1547 break 

1548 mean_params += update 

1549 fit_history['params'].append(mean_params.copy()) 

1550 self.update_cached_means(mean_params) 

1551 

1552 if itr != 0 and (itr % update_assoc == 0): 

1553 self._update_assoc(mean_params) 

1554 

1555 if not converged: 

1556 msg = "GEE.fit_regularized did not converge" 

1557 warnings.warn(msg) 

1558 

1559 mean_params[np.abs(mean_params) < ztol] = 0 

1560 

1561 self._update_assoc(mean_params) 

1562 ma = self._regularized_covmat(mean_params) 

1563 cov = np.linalg.solve(hm, ma) 

1564 cov = np.linalg.solve(hm, cov.T) 

1565 

1566 # kwargs to add to results instance, need to be available in __init__ 

1567 res_kwds = dict(cov_type="robust", cov_robust=cov) 

1568 

1569 scale = self.estimate_scale() 

1570 rslt = GEEResults(self, mean_params, cov, scale, 

1571 regularized=True, attr_kwds=res_kwds) 

1572 rslt.fit_history = fit_history 

1573 

1574 return GEEResultsWrapper(rslt) 

1575 

1576 def _handle_constraint(self, mean_params, bcov): 

1577 """ 

1578 Expand the parameter estimate `mean_params` and covariance matrix 

1579 `bcov` to the coordinate system of the unconstrained model. 

1580 

1581 Parameters 

1582 ---------- 

1583 mean_params : array_like 

1584 A parameter vector estimate for the reduced model. 

1585 bcov : array_like 

1586 The covariance matrix of mean_params. 

1587 

1588 Returns 

1589 ------- 

1590 mean_params : array_like 

1591 The input parameter vector mean_params, expanded to the 

1592 coordinate system of the full model 

1593 bcov : array_like 

1594 The input covariance matrix bcov, expanded to the 

1595 coordinate system of the full model 

1596 """ 

1597 

1598 # The number of variables in the full model 

1599 red_p = len(mean_params) 

1600 full_p = self.constraint.lhs.shape[1] 

1601 mean_params0 = np.r_[mean_params, np.zeros(full_p - red_p)] 

1602 

1603 # Get the score vector under the full model. 

1604 save_exog_li = self.exog_li 

1605 self.exog_li = self.constraint.exog_fulltrans_li 

1606 import copy 

1607 save_cached_means = copy.deepcopy(self.cached_means) 

1608 self.update_cached_means(mean_params0) 

1609 _, score = self._update_mean_params() 

1610 

1611 if score is None: 

1612 warnings.warn("Singular matrix encountered in GEE score test", 

1613 ConvergenceWarning) 

1614 return None, None 

1615 

1616 _, ncov1, cmat = self._covmat() 

1617 scale = self.estimate_scale() 

1618 cmat = cmat / scale ** 2 

1619 score2 = score[red_p:] / scale 

1620 amat = np.linalg.inv(ncov1) 

1621 

1622 bmat_11 = cmat[0:red_p, 0:red_p] 

1623 bmat_22 = cmat[red_p:, red_p:] 

1624 bmat_12 = cmat[0:red_p, red_p:] 

1625 amat_11 = amat[0:red_p, 0:red_p] 

1626 amat_12 = amat[0:red_p, red_p:] 

1627 

1628 score_cov = bmat_22 - np.dot(amat_12.T, 

1629 np.linalg.solve(amat_11, bmat_12)) 

1630 score_cov -= np.dot(bmat_12.T, 

1631 np.linalg.solve(amat_11, amat_12)) 

1632 score_cov += np.dot(amat_12.T, 

1633 np.dot(np.linalg.solve(amat_11, bmat_11), 

1634 np.linalg.solve(amat_11, amat_12))) 

1635 

1636 from scipy.stats.distributions import chi2 

1637 score_statistic = np.dot(score2, 

1638 np.linalg.solve(score_cov, score2)) 

1639 score_df = len(score2) 

1640 score_pvalue = 1 - chi2.cdf(score_statistic, score_df) 

1641 self.score_test_results = {"statistic": score_statistic, 

1642 "df": score_df, 

1643 "p-value": score_pvalue} 

1644 

1645 mean_params = self.constraint.unpack_param(mean_params) 

1646 bcov = self.constraint.unpack_cov(bcov) 

1647 

1648 self.exog_li = save_exog_li 

1649 self.cached_means = save_cached_means 

1650 self.exog = self.constraint.restore_exog() 

1651 

1652 return mean_params, bcov 

1653 

1654 def _update_assoc(self, params): 

1655 """ 

1656 Update the association parameters 

1657 """ 

1658 

1659 self.cov_struct.update(params) 

1660 

1661 def _derivative_exog(self, params, exog=None, transform='dydx', 

1662 dummy_idx=None, count_idx=None): 

1663 """ 

1664 For computing marginal effects, returns dF(XB) / dX where F(.) 

1665 is the fitted mean. 

1666 

1667 transform can be 'dydx', 'dyex', 'eydx', or 'eyex'. 

1668 

1669 Not all of these make sense in the presence of discrete regressors, 

1670 but checks are done in the results in get_margeff. 

1671 """ 

1672 # This form should be appropriate for group 1 probit, logit, 

1673 # logistic, cloglog, heckprob, xtprobit. 

1674 offset_exposure = None 

1675 if exog is None: 

1676 exog = self.exog 

1677 offset_exposure = self._offset_exposure 

1678 

1679 margeff = self.mean_deriv_exog(exog, params, offset_exposure) 

1680 

1681 if 'ex' in transform: 

1682 margeff *= exog 

1683 if 'ey' in transform: 

1684 margeff /= self.predict(params, exog)[:, None] 

1685 if count_idx is not None: 

1686 from statsmodels.discrete.discrete_margins import ( 

1687 _get_count_effects) 

1688 margeff = _get_count_effects(margeff, exog, count_idx, transform, 

1689 self, params) 

1690 if dummy_idx is not None: 

1691 from statsmodels.discrete.discrete_margins import ( 

1692 _get_dummy_effects) 

1693 margeff = _get_dummy_effects(margeff, exog, dummy_idx, transform, 

1694 self, params) 

1695 return margeff 

1696 

1697 def qic(self, params, scale, cov_params): 

1698 """ 

1699 Returns quasi-information criteria and quasi-likelihood values. 

1700 

1701 Parameters 

1702 ---------- 

1703 params : array_like 

1704 The GEE estimates of the regression parameters. 

1705 scale : scalar 

1706 Estimated scale parameter 

1707 cov_params : array_like 

1708 An estimate of the covariance matrix for the 

1709 model parameters. Conventionally this is the robust 

1710 covariance matrix. 

1711 

1712 Returns 

1713 ------- 

1714 ql : scalar 

1715 The quasi-likelihood value 

1716 qic : scalar 

1717 A QIC that can be used to compare the mean and covariance 

1718 structures of the model. 

1719 qicu : scalar 

1720 A simplified QIC that can be used to compare mean structures 

1721 but not covariance structures 

1722 

1723 Notes 

1724 ----- 

1725 The quasi-likelihood used here is obtained by numerically evaluating 

1726 Wedderburn's integral representation of the quasi-likelihood function. 

1727 This approach is valid for all families and links. Many other 

1728 packages use analytical expressions for quasi-likelihoods that are 

1729 valid in special cases where the link function is canonical. These 

1730 analytical expressions may omit additive constants that only depend 

1731 on the data. Therefore, the numerical values of our QL and QIC values 

1732 will differ from the values reported by other packages. However only 

1733 the differences between two QIC values calculated for different models 

1734 using the same data are meaningful. Our QIC should produce the same 

1735 QIC differences as other software. 

1736 

1737 When using the QIC for models with unknown scale parameter, use a 

1738 common estimate of the scale parameter for all models being compared. 

1739 

1740 References 

1741 ---------- 

1742 .. [*] W. Pan (2001). Akaike's information criterion in generalized 

1743 estimating equations. Biometrics (57) 1. 

1744 """ 

1745 

1746 varfunc = self.family.variance 

1747 

1748 means = [] 

1749 omega = 0.0 

1750 # omega^-1 is the model-based covariance assuming independence 

1751 

1752 for i in range(self.num_group): 

1753 expval, lpr = self.cached_means[i] 

1754 means.append(expval) 

1755 dmat = self.mean_deriv(self.exog_li[i], lpr) 

1756 omega += np.dot(dmat.T, dmat) / scale 

1757 

1758 means = np.concatenate(means) 

1759 

1760 # The quasi-likelihood, use change of variables so the integration is 

1761 # from -1 to 1. 

1762 du = means - self.endog 

1763 nstep = 10000 

1764 qv = np.empty(nstep) 

1765 xv = np.linspace(-0.99999, 1, nstep) 

1766 for i, g in enumerate(xv): 

1767 u = self.endog + (g + 1) * du / 2.0 

1768 vu = varfunc(u) 

1769 qv[i] = -np.sum(du**2 * (g + 1) / vu) 

1770 qv /= (4 * scale) 

1771 

1772 from scipy.integrate import trapz 

1773 ql = trapz(qv, dx=xv[1] - xv[0]) 

1774 

1775 qicu = -2 * ql + 2 * self.exog.shape[1] 

1776 qic = -2 * ql + 2 * np.trace(np.dot(omega, cov_params)) 

1777 

1778 return ql, qic, qicu 

1779 

1780 

1781class GEEResults(base.LikelihoodModelResults): 

1782 

1783 __doc__ = ( 

1784 "This class summarizes the fit of a marginal regression model " 

1785 "using GEE.\n" + _gee_results_doc) 

1786 

1787 def __init__(self, model, params, cov_params, scale, 

1788 cov_type='robust', use_t=False, regularized=False, 

1789 **kwds): 

1790 

1791 super(GEEResults, self).__init__( 

1792 model, params, normalized_cov_params=cov_params, 

1793 scale=scale) 

1794 

1795 # not added by super 

1796 self.df_resid = model.df_resid 

1797 self.df_model = model.df_model 

1798 self.family = model.family 

1799 

1800 attr_kwds = kwds.pop('attr_kwds', {}) 

1801 self.__dict__.update(attr_kwds) 

1802 

1803 # we do not do this if the cov_type has already been set 

1804 # subclasses can set it through attr_kwds 

1805 if not (hasattr(self, 'cov_type') and 

1806 hasattr(self, 'cov_params_default')): 

1807 self.cov_type = cov_type # keep alias 

1808 covariance_type = self.cov_type.lower() 

1809 allowed_covariances = ["robust", "naive", "bias_reduced"] 

1810 if covariance_type not in allowed_covariances: 

1811 msg = ("GEE: `cov_type` must be one of " + 

1812 ", ".join(allowed_covariances)) 

1813 raise ValueError(msg) 

1814 

1815 if cov_type == "robust": 

1816 cov = self.cov_robust 

1817 elif cov_type == "naive": 

1818 cov = self.cov_naive 

1819 elif cov_type == "bias_reduced": 

1820 cov = self.cov_robust_bc 

1821 

1822 self.cov_params_default = cov 

1823 else: 

1824 if self.cov_type != cov_type: 

1825 raise ValueError('cov_type in argument is different from ' 

1826 'already attached cov_type') 

1827 

1828 def standard_errors(self, cov_type="robust"): 

1829 """ 

1830 This is a convenience function that returns the standard 

1831 errors for any covariance type. The value of `bse` is the 

1832 standard errors for whichever covariance type is specified as 

1833 an argument to `fit` (defaults to "robust"). 

1834 

1835 Parameters 

1836 ---------- 

1837 cov_type : str 

1838 One of "robust", "naive", or "bias_reduced". Determines 

1839 the covariance used to compute standard errors. Defaults 

1840 to "robust". 

1841 """ 

1842 

1843 # Check covariance_type 

1844 covariance_type = cov_type.lower() 

1845 allowed_covariances = ["robust", "naive", "bias_reduced"] 

1846 if covariance_type not in allowed_covariances: 

1847 msg = ("GEE: `covariance_type` must be one of " + 

1848 ", ".join(allowed_covariances)) 

1849 raise ValueError(msg) 

1850 

1851 if covariance_type == "robust": 

1852 return np.sqrt(np.diag(self.cov_robust)) 

1853 elif covariance_type == "naive": 

1854 return np.sqrt(np.diag(self.cov_naive)) 

1855 elif covariance_type == "bias_reduced": 

1856 if self.cov_robust_bc is None: 

1857 raise ValueError( 

1858 "GEE: `bias_reduced` covariance not available") 

1859 return np.sqrt(np.diag(self.cov_robust_bc)) 

1860 

1861 # Need to override to allow for different covariance types. 

1862 @cache_readonly 

1863 def bse(self): 

1864 return self.standard_errors(self.cov_type) 

1865 

1866 @cache_readonly 

1867 def resid(self): 

1868 """ 

1869 Returns the residuals, the endogeneous data minus the fitted 

1870 values from the model. 

1871 """ 

1872 return self.model.endog - self.fittedvalues 

1873 

1874 def score_test(self): 

1875 """ 

1876 Return the results of a score test for a linear constraint. 

1877 

1878 Returns 

1879 ------- 

1880 Adictionary containing the p-value, the test statistic, 

1881 and the degrees of freedom for the score test. 

1882 

1883 Notes 

1884 ----- 

1885 See also GEE.compare_score_test for an alternative way to perform 

1886 a score test. GEEResults.score_test is more general, in that it 

1887 supports testing arbitrary linear equality constraints. However 

1888 GEE.compare_score_test might be easier to use when comparing 

1889 two explicit models. 

1890 

1891 References 

1892 ---------- 

1893 Xu Guo and Wei Pan (2002). "Small sample performance of the score 

1894 test in GEE". 

1895 http://www.sph.umn.edu/faculty1/wp-content/uploads/2012/11/rr2002-013.pdf 

1896 """ 

1897 

1898 if not hasattr(self.model, "score_test_results"): 

1899 msg = "score_test on results instance only available when " 

1900 msg += " model was fit with constraints" 

1901 raise ValueError(msg) 

1902 

1903 return self.model.score_test_results 

1904 

1905 @cache_readonly 

1906 def resid_split(self): 

1907 """ 

1908 Returns the residuals, the endogeneous data minus the fitted 

1909 values from the model. The residuals are returned as a list 

1910 of arrays containing the residuals for each cluster. 

1911 """ 

1912 sresid = [] 

1913 for v in self.model.group_labels: 

1914 ii = self.model.group_indices[v] 

1915 sresid.append(self.resid[ii]) 

1916 return sresid 

1917 

1918 @cache_readonly 

1919 def resid_centered(self): 

1920 """ 

1921 Returns the residuals centered within each group. 

1922 """ 

1923 cresid = self.resid.copy() 

1924 for v in self.model.group_labels: 

1925 ii = self.model.group_indices[v] 

1926 cresid[ii] -= cresid[ii].mean() 

1927 return cresid 

1928 

1929 @cache_readonly 

1930 def resid_centered_split(self): 

1931 """ 

1932 Returns the residuals centered within each group. The 

1933 residuals are returned as a list of arrays containing the 

1934 centered residuals for each cluster. 

1935 """ 

1936 sresid = [] 

1937 for v in self.model.group_labels: 

1938 ii = self.model.group_indices[v] 

1939 sresid.append(self.centered_resid[ii]) 

1940 return sresid 

1941 

1942 def qic(self, scale=None): 

1943 """ 

1944 Returns the QIC and QICu information criteria. 

1945 

1946 For families with a scale parameter (e.g. Gaussian), provide 

1947 as the scale argument the estimated scale from the largest 

1948 model under consideration. 

1949 

1950 If the scale parameter is not provided, the estimated scale 

1951 parameter is used. Doing this does not allow comparisons of 

1952 QIC values between models. 

1953 """ 

1954 

1955 # It is easy to forget to set the scale parameter. Sometimes 

1956 # this is intentional, so we warn. 

1957 if scale is None: 

1958 warnings.warn("QIC values obtained using scale=None are not " 

1959 "appropriate for comparing models") 

1960 

1961 if scale is None: 

1962 scale = self.scale 

1963 

1964 _, qic, qicu = self.model.qic(self.params, scale, 

1965 self.cov_params()) 

1966 

1967 return qic, qicu 

1968 

1969 # FIXME: alias to be removed, temporary backwards compatibility 

1970 split_resid = resid_split 

1971 centered_resid = resid_centered 

1972 split_centered_resid = resid_centered_split 

1973 

1974 @cache_readonly 

1975 def resid_response(self): 

1976 return self.model.endog - self.fittedvalues 

1977 

1978 @cache_readonly 

1979 def resid_pearson(self): 

1980 val = self.model.endog - self.fittedvalues 

1981 val = val / np.sqrt(self.family.variance(self.fittedvalues)) 

1982 return val 

1983 

1984 @cache_readonly 

1985 def resid_working(self): 

1986 val = self.resid_response 

1987 val = val * self.family.link.deriv(self.fittedvalues) 

1988 return val 

1989 

1990 @cache_readonly 

1991 def resid_anscombe(self): 

1992 return self.family.resid_anscombe(self.model.endog, self.fittedvalues) 

1993 

1994 @cache_readonly 

1995 def resid_deviance(self): 

1996 return self.family.resid_dev(self.model.endog, self.fittedvalues) 

1997 

1998 @cache_readonly 

1999 def fittedvalues(self): 

2000 """ 

2001 Returns the fitted values from the model. 

2002 """ 

2003 return self.model.family.link.inverse(np.dot(self.model.exog, 

2004 self.params)) 

2005 

2006 @Appender(_plot_added_variable_doc % {'extra_params_doc': ''}) 

2007 def plot_added_variable(self, focus_exog, resid_type=None, 

2008 use_glm_weights=True, fit_kwargs=None, 

2009 ax=None): 

2010 

2011 from statsmodels.graphics.regressionplots import plot_added_variable 

2012 

2013 fig = plot_added_variable(self, focus_exog, 

2014 resid_type=resid_type, 

2015 use_glm_weights=use_glm_weights, 

2016 fit_kwargs=fit_kwargs, ax=ax) 

2017 

2018 return fig 

2019 

2020 @Appender(_plot_partial_residuals_doc % {'extra_params_doc': ''}) 

2021 def plot_partial_residuals(self, focus_exog, ax=None): 

2022 

2023 from statsmodels.graphics.regressionplots import plot_partial_residuals 

2024 

2025 return plot_partial_residuals(self, focus_exog, ax=ax) 

2026 

2027 @Appender(_plot_ceres_residuals_doc % {'extra_params_doc': ''}) 

2028 def plot_ceres_residuals(self, focus_exog, frac=0.66, cond_means=None, 

2029 ax=None): 

2030 

2031 from statsmodels.graphics.regressionplots import plot_ceres_residuals 

2032 

2033 return plot_ceres_residuals(self, focus_exog, frac, 

2034 cond_means=cond_means, ax=ax) 

2035 

2036 def conf_int(self, alpha=.05, cols=None, cov_type=None): 

2037 """ 

2038 Returns confidence intervals for the fitted parameters. 

2039 

2040 Parameters 

2041 ---------- 

2042 alpha : float, optional 

2043 The `alpha` level for the confidence interval. i.e., The 

2044 default `alpha` = .05 returns a 95% confidence interval. 

2045 cols : array_like, optional 

2046 `cols` specifies which confidence intervals to return 

2047 cov_type : str 

2048 The covariance type used for computing standard errors; 

2049 must be one of 'robust', 'naive', and 'bias reduced'. 

2050 See `GEE` for details. 

2051 

2052 Notes 

2053 ----- 

2054 The confidence interval is based on the Gaussian distribution. 

2055 """ 

2056 # super does not allow to specify cov_type and method is not 

2057 # implemented, 

2058 # FIXME: remove this method here 

2059 if cov_type is None: 

2060 bse = self.bse 

2061 else: 

2062 bse = self.standard_errors(cov_type=cov_type) 

2063 params = self.params 

2064 dist = stats.norm 

2065 q = dist.ppf(1 - alpha / 2) 

2066 

2067 if cols is None: 

2068 lower = self.params - q * bse 

2069 upper = self.params + q * bse 

2070 else: 

2071 cols = np.asarray(cols) 

2072 lower = params[cols] - q * bse[cols] 

2073 upper = params[cols] + q * bse[cols] 

2074 return np.asarray(lzip(lower, upper)) 

2075 

2076 def summary(self, yname=None, xname=None, title=None, alpha=.05): 

2077 """ 

2078 Summarize the GEE regression results 

2079 

2080 Parameters 

2081 ---------- 

2082 yname : str, optional 

2083 Default is `y` 

2084 xname : list[str], optional 

2085 Names for the exogenous variables, default is `var_#` for ## in 

2086 the number of regressors. Must match the number of parameters in 

2087 the model 

2088 title : str, optional 

2089 Title for the top table. If not None, then this replaces 

2090 the default title 

2091 alpha : float 

2092 significance level for the confidence intervals 

2093 cov_type : str 

2094 The covariance type used to compute the standard errors; 

2095 one of 'robust' (the usual robust sandwich-type covariance 

2096 estimate), 'naive' (ignores dependence), and 'bias 

2097 reduced' (the Mancl/DeRouen estimate). 

2098 

2099 Returns 

2100 ------- 

2101 smry : Summary instance 

2102 this holds the summary tables and text, which can be 

2103 printed or converted to various output formats. 

2104 

2105 See Also 

2106 -------- 

2107 statsmodels.iolib.summary.Summary : class to hold summary results 

2108 """ 

2109 

2110 top_left = [('Dep. Variable:', None), 

2111 ('Model:', None), 

2112 ('Method:', ['Generalized']), 

2113 ('', ['Estimating Equations']), 

2114 ('Family:', [self.model.family.__class__.__name__]), 

2115 ('Dependence structure:', 

2116 [self.model.cov_struct.__class__.__name__]), 

2117 ('Date:', None), 

2118 ('Covariance type: ', [self.cov_type, ]) 

2119 ] 

2120 

2121 NY = [len(y) for y in self.model.endog_li] 

2122 

2123 top_right = [('No. Observations:', [sum(NY)]), 

2124 ('No. clusters:', [len(self.model.endog_li)]), 

2125 ('Min. cluster size:', [min(NY)]), 

2126 ('Max. cluster size:', [max(NY)]), 

2127 ('Mean cluster size:', ["%.1f" % np.mean(NY)]), 

2128 ('Num. iterations:', ['%d' % 

2129 len(self.fit_history['params'])]), 

2130 ('Scale:', ["%.3f" % self.scale]), 

2131 ('Time:', None), 

2132 ] 

2133 

2134 # The skew of the residuals 

2135 skew1 = stats.skew(self.resid) 

2136 kurt1 = stats.kurtosis(self.resid) 

2137 skew2 = stats.skew(self.centered_resid) 

2138 kurt2 = stats.kurtosis(self.centered_resid) 

2139 

2140 diagn_left = [('Skew:', ["%12.4f" % skew1]), 

2141 ('Centered skew:', ["%12.4f" % skew2])] 

2142 

2143 diagn_right = [('Kurtosis:', ["%12.4f" % kurt1]), 

2144 ('Centered kurtosis:', ["%12.4f" % kurt2]) 

2145 ] 

2146 

2147 if title is None: 

2148 title = self.model.__class__.__name__ + ' ' +\ 

2149 "Regression Results" 

2150 

2151 # Override the exog variable names if xname is provided as an 

2152 # argument. 

2153 if xname is None: 

2154 xname = self.model.exog_names 

2155 

2156 if yname is None: 

2157 yname = self.model.endog_names 

2158 

2159 # Create summary table instance 

2160 from statsmodels.iolib.summary import Summary 

2161 smry = Summary() 

2162 smry.add_table_2cols(self, gleft=top_left, gright=top_right, 

2163 yname=yname, xname=xname, 

2164 title=title) 

2165 smry.add_table_params(self, yname=yname, xname=xname, 

2166 alpha=alpha, use_t=False) 

2167 smry.add_table_2cols(self, gleft=diagn_left, 

2168 gright=diagn_right, yname=yname, 

2169 xname=xname, title="") 

2170 

2171 return smry 

2172 

2173 def get_margeff(self, at='overall', method='dydx', atexog=None, 

2174 dummy=False, count=False): 

2175 """Get marginal effects of the fitted model. 

2176 

2177 Parameters 

2178 ---------- 

2179 at : str, optional 

2180 Options are: 

2181 

2182 - 'overall', The average of the marginal effects at each 

2183 observation. 

2184 - 'mean', The marginal effects at the mean of each regressor. 

2185 - 'median', The marginal effects at the median of each regressor. 

2186 - 'zero', The marginal effects at zero for each regressor. 

2187 - 'all', The marginal effects at each observation. If `at` is 'all' 

2188 only margeff will be available. 

2189 

2190 Note that if `exog` is specified, then marginal effects for all 

2191 variables not specified by `exog` are calculated using the `at` 

2192 option. 

2193 method : str, optional 

2194 Options are: 

2195 

2196 - 'dydx' - dy/dx - No transformation is made and marginal effects 

2197 are returned. This is the default. 

2198 - 'eyex' - estimate elasticities of variables in `exog` -- 

2199 d(lny)/d(lnx) 

2200 - 'dyex' - estimate semi-elasticity -- dy/d(lnx) 

2201 - 'eydx' - estimate semi-elasticity -- d(lny)/dx 

2202 

2203 Note that tranformations are done after each observation is 

2204 calculated. Semi-elasticities for binary variables are computed 

2205 using the midpoint method. 'dyex' and 'eyex' do not make sense 

2206 for discrete variables. 

2207 atexog : array_like, optional 

2208 Optionally, you can provide the exogenous variables over which to 

2209 get the marginal effects. This should be a dictionary with the key 

2210 as the zero-indexed column number and the value of the dictionary. 

2211 Default is None for all independent variables less the constant. 

2212 dummy : bool, optional 

2213 If False, treats binary variables (if present) as continuous. This 

2214 is the default. Else if True, treats binary variables as 

2215 changing from 0 to 1. Note that any variable that is either 0 or 1 

2216 is treated as binary. Each binary variable is treated separately 

2217 for now. 

2218 count : bool, optional 

2219 If False, treats count variables (if present) as continuous. This 

2220 is the default. Else if True, the marginal effect is the 

2221 change in probabilities when each observation is increased by one. 

2222 

2223 Returns 

2224 ------- 

2225 effects : ndarray 

2226 the marginal effect corresponding to the input options 

2227 

2228 Notes 

2229 ----- 

2230 When using after Poisson, returns the expected number of events 

2231 per period, assuming that the model is loglinear. 

2232 """ 

2233 

2234 if self.model.constraint is not None: 

2235 warnings.warn("marginal effects ignore constraints", 

2236 ValueWarning) 

2237 

2238 return GEEMargins(self, (at, method, atexog, dummy, count)) 

2239 

2240 def plot_isotropic_dependence(self, ax=None, xpoints=10, 

2241 min_n=50): 

2242 """ 

2243 Create a plot of the pairwise products of within-group 

2244 residuals against the corresponding time differences. This 

2245 plot can be used to assess the possible form of an isotropic 

2246 covariance structure. 

2247 

2248 Parameters 

2249 ---------- 

2250 ax : AxesSubplot 

2251 An axes on which to draw the graph. If None, new 

2252 figure and axes objects are created 

2253 xpoints : scalar or array_like 

2254 If scalar, the number of points equally spaced points on 

2255 the time difference axis used to define bins for 

2256 calculating local means. If an array, the specific points 

2257 that define the bins. 

2258 min_n : int 

2259 The minimum sample size in a bin for the mean residual 

2260 product to be included on the plot. 

2261 """ 

2262 

2263 from statsmodels.graphics import utils as gutils 

2264 

2265 resid = self.model.cluster_list(self.resid) 

2266 time = self.model.cluster_list(self.model.time) 

2267 

2268 # All within-group pairwise time distances (xdt) and the 

2269 # corresponding products of scaled residuals (xre). 

2270 xre, xdt = [], [] 

2271 for re, ti in zip(resid, time): 

2272 ix = np.tril_indices(re.shape[0], 0) 

2273 re = re[ix[0]] * re[ix[1]] / self.scale ** 2 

2274 xre.append(re) 

2275 dists = np.sqrt(((ti[ix[0], :] - ti[ix[1], :]) ** 2).sum(1)) 

2276 xdt.append(dists) 

2277 

2278 xre = np.concatenate(xre) 

2279 xdt = np.concatenate(xdt) 

2280 

2281 if ax is None: 

2282 fig, ax = gutils.create_mpl_ax(ax) 

2283 else: 

2284 fig = ax.get_figure() 

2285 

2286 # Convert to a correlation 

2287 ii = np.flatnonzero(xdt == 0) 

2288 v0 = np.mean(xre[ii]) 

2289 xre /= v0 

2290 

2291 # Use the simple average to smooth, since fancier smoothers 

2292 # that trim and downweight outliers give biased results (we 

2293 # need the actual mean of a skewed distribution). 

2294 if np.isscalar(xpoints): 

2295 xpoints = np.linspace(0, max(xdt), xpoints) 

2296 dg = np.digitize(xdt, xpoints) 

2297 dgu = np.unique(dg) 

2298 hist = np.asarray([np.sum(dg == k) for k in dgu]) 

2299 ii = np.flatnonzero(hist >= min_n) 

2300 dgu = dgu[ii] 

2301 dgy = np.asarray([np.mean(xre[dg == k]) for k in dgu]) 

2302 dgx = np.asarray([np.mean(xdt[dg == k]) for k in dgu]) 

2303 

2304 ax.plot(dgx, dgy, '-', color='orange', lw=5) 

2305 ax.set_xlabel("Time difference") 

2306 ax.set_ylabel("Product of scaled residuals") 

2307 

2308 return fig 

2309 

2310 def sensitivity_params(self, dep_params_first, 

2311 dep_params_last, num_steps): 

2312 """ 

2313 Refits the GEE model using a sequence of values for the 

2314 dependence parameters. 

2315 

2316 Parameters 

2317 ---------- 

2318 dep_params_first : array_like 

2319 The first dep_params in the sequence 

2320 dep_params_last : array_like 

2321 The last dep_params in the sequence 

2322 num_steps : int 

2323 The number of dep_params in the sequence 

2324 

2325 Returns 

2326 ------- 

2327 results : array_like 

2328 The GEEResults objects resulting from the fits. 

2329 """ 

2330 

2331 model = self.model 

2332 

2333 import copy 

2334 cov_struct = copy.deepcopy(self.model.cov_struct) 

2335 

2336 # We are fixing the dependence structure in each run. 

2337 update_dep = model.update_dep 

2338 model.update_dep = False 

2339 

2340 dep_params = [] 

2341 results = [] 

2342 for x in np.linspace(0, 1, num_steps): 

2343 

2344 dp = x * dep_params_last + (1 - x) * dep_params_first 

2345 dep_params.append(dp) 

2346 

2347 model.cov_struct = copy.deepcopy(cov_struct) 

2348 model.cov_struct.dep_params = dp 

2349 rslt = model.fit(start_params=self.params, 

2350 ctol=self.ctol, 

2351 params_niter=self.params_niter, 

2352 first_dep_update=self.first_dep_update, 

2353 cov_type=self.cov_type) 

2354 results.append(rslt) 

2355 

2356 model.update_dep = update_dep 

2357 

2358 return results 

2359 

2360 # FIXME: alias to be removed, temporary backwards compatibility 

2361 params_sensitivity = sensitivity_params 

2362 

2363 

2364class GEEResultsWrapper(lm.RegressionResultsWrapper): 

2365 _attrs = { 

2366 'centered_resid': 'rows', 

2367 } 

2368 _wrap_attrs = wrap.union_dicts(lm.RegressionResultsWrapper._wrap_attrs, 

2369 _attrs) 

2370wrap.populate_wrapper(GEEResultsWrapper, GEEResults) # noqa:E305 

2371 

2372 

2373class OrdinalGEE(GEE): 

2374 

2375 __doc__ = ( 

2376 " Ordinal Response Marginal Regression Model using GEE\n" + 

2377 _gee_init_doc % {'extra_params': base._missing_param_doc, 

2378 'family_doc': _gee_ordinal_family_doc, 

2379 'example': _gee_ordinal_example}) 

2380 

2381 def __init__(self, endog, exog, groups, time=None, family=None, 

2382 cov_struct=None, missing='none', offset=None, 

2383 dep_data=None, constraint=None, **kwargs): 

2384 

2385 if family is None: 

2386 family = families.Binomial() 

2387 else: 

2388 if not isinstance(family, families.Binomial): 

2389 raise ValueError("ordinal GEE must use a Binomial family") 

2390 

2391 if cov_struct is None: 

2392 cov_struct = cov_structs.OrdinalIndependence() 

2393 

2394 endog, exog, groups, time, offset = self.setup_ordinal( 

2395 endog, exog, groups, time, offset) 

2396 

2397 super(OrdinalGEE, self).__init__(endog, exog, groups, time, 

2398 family, cov_struct, missing, 

2399 offset, dep_data, constraint) 

2400 

2401 def setup_ordinal(self, endog, exog, groups, time, offset): 

2402 """ 

2403 Restructure ordinal data as binary indicators so that they can 

2404 be analyzed using Generalized Estimating Equations. 

2405 """ 

2406 

2407 self.endog_orig = endog.copy() 

2408 self.exog_orig = exog.copy() 

2409 self.groups_orig = groups.copy() 

2410 if offset is not None: 

2411 self.offset_orig = offset.copy() 

2412 else: 

2413 self.offset_orig = None 

2414 offset = np.zeros(len(endog)) 

2415 if time is not None: 

2416 self.time_orig = time.copy() 

2417 else: 

2418 self.time_orig = None 

2419 time = np.zeros((len(endog), 1)) 

2420 

2421 exog = np.asarray(exog) 

2422 endog = np.asarray(endog) 

2423 groups = np.asarray(groups) 

2424 time = np.asarray(time) 

2425 offset = np.asarray(offset) 

2426 

2427 # The unique outcomes, except the greatest one. 

2428 self.endog_values = np.unique(endog) 

2429 endog_cuts = self.endog_values[0:-1] 

2430 ncut = len(endog_cuts) 

2431 

2432 nrows = ncut * len(endog) 

2433 exog_out = np.zeros((nrows, exog.shape[1]), 

2434 dtype=np.float64) 

2435 endog_out = np.zeros(nrows, dtype=np.float64) 

2436 intercepts = np.zeros((nrows, ncut), dtype=np.float64) 

2437 groups_out = np.zeros(nrows, dtype=groups.dtype) 

2438 time_out = np.zeros((nrows, time.shape[1]), 

2439 dtype=np.float64) 

2440 offset_out = np.zeros(nrows, dtype=np.float64) 

2441 

2442 jrow = 0 

2443 zipper = zip(exog, endog, groups, time, offset) 

2444 for (exog_row, endog_value, group_value, time_value, 

2445 offset_value) in zipper: 

2446 

2447 # Loop over thresholds for the indicators 

2448 for thresh_ix, thresh in enumerate(endog_cuts): 

2449 

2450 exog_out[jrow, :] = exog_row 

2451 endog_out[jrow] = (int(endog_value > thresh)) 

2452 intercepts[jrow, thresh_ix] = 1 

2453 groups_out[jrow] = group_value 

2454 time_out[jrow] = time_value 

2455 offset_out[jrow] = offset_value 

2456 jrow += 1 

2457 

2458 exog_out = np.concatenate((intercepts, exog_out), axis=1) 

2459 

2460 # exog column names, including intercepts 

2461 xnames = ["I(y>%.1f)" % v for v in endog_cuts] 

2462 if type(self.exog_orig) == pd.DataFrame: 

2463 xnames.extend(self.exog_orig.columns) 

2464 else: 

2465 xnames.extend(["x%d" % k for k in range(1, exog.shape[1] + 1)]) 

2466 exog_out = pd.DataFrame(exog_out, columns=xnames) 

2467 

2468 # Preserve the endog name if there is one 

2469 if type(self.endog_orig) == pd.Series: 

2470 endog_out = pd.Series(endog_out, name=self.endog_orig.name) 

2471 

2472 return endog_out, exog_out, groups_out, time_out, offset_out 

2473 

2474 def _starting_params(self): 

2475 model = GEE(self.endog, self.exog, self.groups, 

2476 time=self.time, family=families.Binomial(), 

2477 offset=self.offset, exposure=self.exposure) 

2478 result = model.fit() 

2479 return result.params 

2480 

2481 @Appender(_gee_fit_doc) 

2482 def fit(self, maxiter=60, ctol=1e-6, start_params=None, 

2483 params_niter=1, first_dep_update=0, 

2484 cov_type='robust'): 

2485 

2486 rslt = super(OrdinalGEE, self).fit(maxiter, ctol, start_params, 

2487 params_niter, first_dep_update, 

2488 cov_type=cov_type) 

2489 

2490 rslt = rslt._results # use unwrapped instance 

2491 res_kwds = dict(((k, getattr(rslt, k)) for k in rslt._props)) 

2492 # Convert the GEEResults to an OrdinalGEEResults 

2493 ord_rslt = OrdinalGEEResults(self, rslt.params, 

2494 rslt.cov_params() / rslt.scale, 

2495 rslt.scale, 

2496 cov_type=cov_type, 

2497 attr_kwds=res_kwds) 

2498 # for k in rslt._props: 

2499 # setattr(ord_rslt, k, getattr(rslt, k)) 

2500 # TODO: document or delete 

2501 

2502 return OrdinalGEEResultsWrapper(ord_rslt) 

2503 

2504 

2505class OrdinalGEEResults(GEEResults): 

2506 

2507 __doc__ = ( 

2508 "This class summarizes the fit of a marginal regression model" 

2509 "for an ordinal response using GEE.\n" 

2510 + _gee_results_doc) 

2511 

2512 def plot_distribution(self, ax=None, exog_values=None): 

2513 """ 

2514 Plot the fitted probabilities of endog in an ordinal model, 

2515 for specified values of the predictors. 

2516 

2517 Parameters 

2518 ---------- 

2519 ax : AxesSubplot 

2520 An axes on which to draw the graph. If None, new 

2521 figure and axes objects are created 

2522 exog_values : array_like 

2523 A list of dictionaries, with each dictionary mapping 

2524 variable names to values at which the variable is held 

2525 fixed. The values P(endog=y | exog) are plotted for all 

2526 possible values of y, at the given exog value. Variables 

2527 not included in a dictionary are held fixed at the mean 

2528 value. 

2529 

2530 Example: 

2531 -------- 

2532 We have a model with covariates 'age' and 'sex', and wish to 

2533 plot the probabilities P(endog=y | exog) for males (sex=0) and 

2534 for females (sex=1), as separate paths on the plot. Since 

2535 'age' is not included below in the map, it is held fixed at 

2536 its mean value. 

2537 

2538 >>> ev = [{"sex": 1}, {"sex": 0}] 

2539 >>> rslt.distribution_plot(exog_values=ev) 

2540 """ 

2541 

2542 from statsmodels.graphics import utils as gutils 

2543 

2544 if ax is None: 

2545 fig, ax = gutils.create_mpl_ax(ax) 

2546 else: 

2547 fig = ax.get_figure() 

2548 

2549 # If no covariate patterns are specified, create one with all 

2550 # variables set to their mean values. 

2551 if exog_values is None: 

2552 exog_values = [{}, ] 

2553 

2554 exog_means = self.model.exog.mean(0) 

2555 ix_icept = [i for i, x in enumerate(self.model.exog_names) if 

2556 x.startswith("I(")] 

2557 

2558 for ev in exog_values: 

2559 

2560 for k in ev.keys(): 

2561 if k not in self.model.exog_names: 

2562 raise ValueError("%s is not a variable in the model" 

2563 % k) 

2564 

2565 # Get the fitted probability for each level, at the given 

2566 # covariate values. 

2567 pr = [] 

2568 for j in ix_icept: 

2569 

2570 xp = np.zeros_like(self.params) 

2571 xp[j] = 1. 

2572 for i, vn in enumerate(self.model.exog_names): 

2573 if i in ix_icept: 

2574 continue 

2575 # User-specified value 

2576 if vn in ev: 

2577 xp[i] = ev[vn] 

2578 # Mean value 

2579 else: 

2580 xp[i] = exog_means[i] 

2581 

2582 p = 1 / (1 + np.exp(-np.dot(xp, self.params))) 

2583 pr.append(p) 

2584 

2585 pr.insert(0, 1) 

2586 pr.append(0) 

2587 pr = np.asarray(pr) 

2588 prd = -np.diff(pr) 

2589 

2590 ax.plot(self.model.endog_values, prd, 'o-') 

2591 

2592 ax.set_xlabel("Response value") 

2593 ax.set_ylabel("Probability") 

2594 ax.set_ylim(0, 1) 

2595 

2596 return fig 

2597 

2598 

2599def _score_test_submodel(par, sub): 

2600 """ 

2601 Return transformation matrices for design matrices. 

2602 

2603 Parameters 

2604 ---------- 

2605 par : instance 

2606 The parent model 

2607 sub : instance 

2608 The sub-model 

2609 

2610 Returns 

2611 ------- 

2612 qm : array_like 

2613 Matrix mapping the design matrix of the parent to the design matrix 

2614 for the sub-model. 

2615 qc : array_like 

2616 Matrix mapping the design matrix of the parent to the orthogonal 

2617 complement of the columnspace of the submodel in the columnspace 

2618 of the parent. 

2619 

2620 Notes 

2621 ----- 

2622 Returns None, None if the provided submodel is not actually a submodel. 

2623 """ 

2624 

2625 x1 = par.exog 

2626 x2 = sub.exog 

2627 

2628 u, s, vt = np.linalg.svd(x1, 0) 

2629 

2630 # Get the orthogonal complement of col(x2) in col(x1). 

2631 a, _, _ = np.linalg.svd(x2, 0) 

2632 a = u - np.dot(a, np.dot(a.T, u)) 

2633 x2c, sb, _ = np.linalg.svd(a, 0) 

2634 x2c = x2c[:, sb > 1e-12] 

2635 

2636 # x1 * qm = x2 

2637 qm = np.dot(vt.T, np.dot(u.T, x2) / s[:, None]) 

2638 

2639 e = np.max(np.abs(x2 - np.dot(x1, qm))) 

2640 if e > 1e-8: 

2641 return None, None 

2642 

2643 # x1 * qc = x2c 

2644 qc = np.dot(vt.T, np.dot(u.T, x2c) / s[:, None]) 

2645 

2646 return qm, qc 

2647 

2648 

2649class OrdinalGEEResultsWrapper(GEEResultsWrapper): 

2650 pass 

2651wrap.populate_wrapper(OrdinalGEEResultsWrapper, OrdinalGEEResults) # noqa:E305 

2652 

2653 

2654class NominalGEE(GEE): 

2655 

2656 __doc__ = ( 

2657 " Nominal Response Marginal Regression Model using GEE.\n" + 

2658 _gee_init_doc % {'extra_params': base._missing_param_doc, 

2659 'family_doc': _gee_nominal_family_doc, 

2660 'example': _gee_nominal_example}) 

2661 

2662 def __init__(self, endog, exog, groups, time=None, family=None, 

2663 cov_struct=None, missing='none', offset=None, 

2664 dep_data=None, constraint=None, **kwargs): 

2665 

2666 endog, exog, groups, time, offset = self.setup_nominal( 

2667 endog, exog, groups, time, offset) 

2668 

2669 if family is None: 

2670 family = _Multinomial(self.ncut + 1) 

2671 

2672 if cov_struct is None: 

2673 cov_struct = cov_structs.NominalIndependence() 

2674 

2675 super(NominalGEE, self).__init__( 

2676 endog, exog, groups, time, family, cov_struct, missing, 

2677 offset, dep_data, constraint) 

2678 

2679 def _starting_params(self): 

2680 model = GEE(self.endog, self.exog, self.groups, 

2681 time=self.time, family=families.Binomial(), 

2682 offset=self.offset, exposure=self.exposure) 

2683 result = model.fit() 

2684 return result.params 

2685 

2686 def setup_nominal(self, endog, exog, groups, time, offset): 

2687 """ 

2688 Restructure nominal data as binary indicators so that they can 

2689 be analyzed using Generalized Estimating Equations. 

2690 """ 

2691 

2692 self.endog_orig = endog.copy() 

2693 self.exog_orig = exog.copy() 

2694 self.groups_orig = groups.copy() 

2695 if offset is not None: 

2696 self.offset_orig = offset.copy() 

2697 else: 

2698 self.offset_orig = None 

2699 offset = np.zeros(len(endog)) 

2700 if time is not None: 

2701 self.time_orig = time.copy() 

2702 else: 

2703 self.time_orig = None 

2704 time = np.zeros((len(endog), 1)) 

2705 

2706 exog = np.asarray(exog) 

2707 endog = np.asarray(endog) 

2708 groups = np.asarray(groups) 

2709 time = np.asarray(time) 

2710 offset = np.asarray(offset) 

2711 

2712 # The unique outcomes, except the greatest one. 

2713 self.endog_values = np.unique(endog) 

2714 endog_cuts = self.endog_values[0:-1] 

2715 ncut = len(endog_cuts) 

2716 self.ncut = ncut 

2717 

2718 nrows = len(endog_cuts) * exog.shape[0] 

2719 ncols = len(endog_cuts) * exog.shape[1] 

2720 exog_out = np.zeros((nrows, ncols), dtype=np.float64) 

2721 endog_out = np.zeros(nrows, dtype=np.float64) 

2722 groups_out = np.zeros(nrows, dtype=np.float64) 

2723 time_out = np.zeros((nrows, time.shape[1]), 

2724 dtype=np.float64) 

2725 offset_out = np.zeros(nrows, dtype=np.float64) 

2726 

2727 jrow = 0 

2728 zipper = zip(exog, endog, groups, time, offset) 

2729 for (exog_row, endog_value, group_value, time_value, 

2730 offset_value) in zipper: 

2731 

2732 # Loop over thresholds for the indicators 

2733 for thresh_ix, thresh in enumerate(endog_cuts): 

2734 

2735 u = np.zeros(len(endog_cuts), dtype=np.float64) 

2736 u[thresh_ix] = 1 

2737 exog_out[jrow, :] = np.kron(u, exog_row) 

2738 endog_out[jrow] = (int(endog_value == thresh)) 

2739 groups_out[jrow] = group_value 

2740 time_out[jrow] = time_value 

2741 offset_out[jrow] = offset_value 

2742 jrow += 1 

2743 

2744 # exog names 

2745 if isinstance(self.exog_orig, pd.DataFrame): 

2746 xnames_in = self.exog_orig.columns 

2747 else: 

2748 xnames_in = ["x%d" % k for k in range(1, exog.shape[1] + 1)] 

2749 xnames = [] 

2750 for tr in endog_cuts: 

2751 xnames.extend(["%s[%.1f]" % (v, tr) for v in xnames_in]) 

2752 exog_out = pd.DataFrame(exog_out, columns=xnames) 

2753 exog_out = pd.DataFrame(exog_out, columns=xnames) 

2754 

2755 # Preserve endog name if there is one 

2756 if isinstance(self.endog_orig, pd.Series): 

2757 endog_out = pd.Series(endog_out, name=self.endog_orig.name) 

2758 

2759 return endog_out, exog_out, groups_out, time_out, offset_out 

2760 

2761 def mean_deriv(self, exog, lin_pred): 

2762 """ 

2763 Derivative of the expected endog with respect to the parameters. 

2764 

2765 Parameters 

2766 ---------- 

2767 exog : array_like 

2768 The exogeneous data at which the derivative is computed, 

2769 number of rows must be a multiple of `ncut`. 

2770 lin_pred : array_like 

2771 The values of the linear predictor, length must be multiple 

2772 of `ncut`. 

2773 

2774 Returns 

2775 ------- 

2776 The derivative of the expected endog with respect to the 

2777 parameters. 

2778 """ 

2779 

2780 expval = np.exp(lin_pred) 

2781 

2782 # Reshape so that each row contains all the indicators 

2783 # corresponding to one multinomial observation. 

2784 expval_m = np.reshape(expval, (len(expval) // self.ncut, 

2785 self.ncut)) 

2786 

2787 # The normalizing constant for the multinomial probabilities. 

2788 denom = 1 + expval_m.sum(1) 

2789 denom = np.kron(denom, np.ones(self.ncut, dtype=np.float64)) 

2790 

2791 # The multinomial probabilities 

2792 mprob = expval / denom 

2793 

2794 # First term of the derivative: denom * expval' / denom^2 = 

2795 # expval' / denom. 

2796 dmat = mprob[:, None] * exog 

2797 

2798 # Second term of the derivative: -expval * denom' / denom^2 

2799 ddenom = expval[:, None] * exog 

2800 dmat -= mprob[:, None] * ddenom / denom[:, None] 

2801 

2802 return dmat 

2803 

2804 def mean_deriv_exog(self, exog, params, offset_exposure=None): 

2805 """ 

2806 Derivative of the expected endog with respect to exog for the 

2807 multinomial model, used in analyzing marginal effects. 

2808 

2809 Parameters 

2810 ---------- 

2811 exog : array_like 

2812 The exogeneous data at which the derivative is computed, 

2813 number of rows must be a multiple of `ncut`. 

2814 lpr : array_like 

2815 The linear predictor values, length must be multiple of 

2816 `ncut`. 

2817 

2818 Returns 

2819 ------- 

2820 The value of the derivative of the expected endog with respect 

2821 to exog. 

2822 

2823 Notes 

2824 ----- 

2825 offset_exposure must be set at None for the multinomial family. 

2826 """ 

2827 

2828 if offset_exposure is not None: 

2829 warnings.warn("Offset/exposure ignored for the multinomial family", 

2830 ValueWarning) 

2831 

2832 lpr = np.dot(exog, params) 

2833 expval = np.exp(lpr) 

2834 

2835 expval_m = np.reshape(expval, (len(expval) // self.ncut, 

2836 self.ncut)) 

2837 

2838 denom = 1 + expval_m.sum(1) 

2839 denom = np.kron(denom, np.ones(self.ncut, dtype=np.float64)) 

2840 

2841 bmat0 = np.outer(np.ones(exog.shape[0]), params) 

2842 

2843 # Masking matrix 

2844 qmat = [] 

2845 for j in range(self.ncut): 

2846 ee = np.zeros(self.ncut, dtype=np.float64) 

2847 ee[j] = 1 

2848 qmat.append(np.kron(ee, np.ones(len(params) // self.ncut))) 

2849 qmat = np.array(qmat) 

2850 qmat = np.kron(np.ones((exog.shape[0] // self.ncut, 1)), qmat) 

2851 bmat = bmat0 * qmat 

2852 

2853 dmat = expval[:, None] * bmat / denom[:, None] 

2854 

2855 expval_mb = np.kron(expval_m, np.ones((self.ncut, 1))) 

2856 expval_mb = np.kron(expval_mb, np.ones((1, self.ncut))) 

2857 

2858 dmat -= expval[:, None] * (bmat * expval_mb) / denom[:, None] ** 2 

2859 

2860 return dmat 

2861 

2862 @Appender(_gee_fit_doc) 

2863 def fit(self, maxiter=60, ctol=1e-6, start_params=None, 

2864 params_niter=1, first_dep_update=0, 

2865 cov_type='robust'): 

2866 

2867 rslt = super(NominalGEE, self).fit(maxiter, ctol, start_params, 

2868 params_niter, first_dep_update, 

2869 cov_type=cov_type) 

2870 if rslt is None: 

2871 warnings.warn("GEE updates did not converge", 

2872 ConvergenceWarning) 

2873 return None 

2874 

2875 rslt = rslt._results # use unwrapped instance 

2876 res_kwds = dict(((k, getattr(rslt, k)) for k in rslt._props)) 

2877 # Convert the GEEResults to a NominalGEEResults 

2878 nom_rslt = NominalGEEResults(self, rslt.params, 

2879 rslt.cov_params() / rslt.scale, 

2880 rslt.scale, 

2881 cov_type=cov_type, 

2882 attr_kwds=res_kwds) 

2883 # TODO: document or delete 

2884 # for k in rslt._props: 

2885 # setattr(nom_rslt, k, getattr(rslt, k)) 

2886 

2887 return NominalGEEResultsWrapper(nom_rslt) 

2888 

2889 

2890class NominalGEEResults(GEEResults): 

2891 

2892 __doc__ = ( 

2893 "This class summarizes the fit of a marginal regression model" 

2894 "for a nominal response using GEE.\n" 

2895 + _gee_results_doc) 

2896 

2897 def plot_distribution(self, ax=None, exog_values=None): 

2898 """ 

2899 Plot the fitted probabilities of endog in an nominal model, 

2900 for specified values of the predictors. 

2901 

2902 Parameters 

2903 ---------- 

2904 ax : AxesSubplot 

2905 An axes on which to draw the graph. If None, new 

2906 figure and axes objects are created 

2907 exog_values : array_like 

2908 A list of dictionaries, with each dictionary mapping 

2909 variable names to values at which the variable is held 

2910 fixed. The values P(endog=y | exog) are plotted for all 

2911 possible values of y, at the given exog value. Variables 

2912 not included in a dictionary are held fixed at the mean 

2913 value. 

2914 

2915 Example: 

2916 -------- 

2917 We have a model with covariates 'age' and 'sex', and wish to 

2918 plot the probabilities P(endog=y | exog) for males (sex=0) and 

2919 for females (sex=1), as separate paths on the plot. Since 

2920 'age' is not included below in the map, it is held fixed at 

2921 its mean value. 

2922 

2923 >>> ex = [{"sex": 1}, {"sex": 0}] 

2924 >>> rslt.distribution_plot(exog_values=ex) 

2925 """ 

2926 

2927 from statsmodels.graphics import utils as gutils 

2928 

2929 if ax is None: 

2930 fig, ax = gutils.create_mpl_ax(ax) 

2931 else: 

2932 fig = ax.get_figure() 

2933 

2934 # If no covariate patterns are specified, create one with all 

2935 # variables set to their mean values. 

2936 if exog_values is None: 

2937 exog_values = [{}, ] 

2938 

2939 link = self.model.family.link.inverse 

2940 ncut = self.model.family.ncut 

2941 

2942 k = int(self.model.exog.shape[1] / ncut) 

2943 exog_means = self.model.exog.mean(0)[0:k] 

2944 exog_names = self.model.exog_names[0:k] 

2945 exog_names = [x.split("[")[0] for x in exog_names] 

2946 

2947 params = np.reshape(self.params, 

2948 (ncut, len(self.params) // ncut)) 

2949 

2950 for ev in exog_values: 

2951 

2952 exog = exog_means.copy() 

2953 

2954 for k in ev.keys(): 

2955 if k not in exog_names: 

2956 raise ValueError("%s is not a variable in the model" 

2957 % k) 

2958 

2959 ii = exog_names.index(k) 

2960 exog[ii] = ev[k] 

2961 

2962 lpr = np.dot(params, exog) 

2963 pr = link(lpr) 

2964 pr = np.r_[pr, 1 - pr.sum()] 

2965 

2966 ax.plot(self.model.endog_values, pr, 'o-') 

2967 

2968 ax.set_xlabel("Response value") 

2969 ax.set_ylabel("Probability") 

2970 ax.set_xticks(self.model.endog_values) 

2971 ax.set_xticklabels(self.model.endog_values) 

2972 ax.set_ylim(0, 1) 

2973 

2974 return fig 

2975 

2976 

2977class NominalGEEResultsWrapper(GEEResultsWrapper): 

2978 pass 

2979wrap.populate_wrapper(NominalGEEResultsWrapper, NominalGEEResults) # noqa:E305 

2980 

2981 

2982class _MultinomialLogit(Link): 

2983 """ 

2984 The multinomial logit transform, only for use with GEE. 

2985 

2986 Notes 

2987 ----- 

2988 The data are assumed coded as binary indicators, where each 

2989 observed multinomial value y is coded as I(y == S[0]), ..., I(y == 

2990 S[-1]), where S is the set of possible response labels, excluding 

2991 the largest one. Thererefore functions in this class should only 

2992 be called using vector argument whose length is a multiple of |S| 

2993 = ncut, which is an argument to be provided when initializing the 

2994 class. 

2995 

2996 call and derivative use a private method _clean to trim p by 1e-10 

2997 so that p is in (0, 1) 

2998 """ 

2999 

3000 def __init__(self, ncut): 

3001 self.ncut = ncut 

3002 

3003 def inverse(self, lpr): 

3004 """ 

3005 Inverse of the multinomial logit transform, which gives the 

3006 expected values of the data as a function of the linear 

3007 predictors. 

3008 

3009 Parameters 

3010 ---------- 

3011 lpr : array_like (length must be divisible by `ncut`) 

3012 The linear predictors 

3013 

3014 Returns 

3015 ------- 

3016 prob : ndarray 

3017 Probabilities, or expected values 

3018 """ 

3019 

3020 expval = np.exp(lpr) 

3021 

3022 denom = 1 + np.reshape(expval, (len(expval) // self.ncut, 

3023 self.ncut)).sum(1) 

3024 denom = np.kron(denom, np.ones(self.ncut, dtype=np.float64)) 

3025 

3026 prob = expval / denom 

3027 

3028 return prob 

3029 

3030 

3031class _Multinomial(families.Family): 

3032 """ 

3033 Pseudo-link function for fitting nominal multinomial models with 

3034 GEE. Not for use outside the GEE class. 

3035 """ 

3036 

3037 links = [_MultinomialLogit, ] 

3038 variance = varfuncs.binary 

3039 safe_links = [_MultinomialLogit, ] 

3040 

3041 def __init__(self, nlevels): 

3042 """ 

3043 Parameters 

3044 ---------- 

3045 nlevels : int 

3046 The number of distinct categories for the multinomial 

3047 distribution. 

3048 """ 

3049 self.initialize(nlevels) 

3050 

3051 def initialize(self, nlevels): 

3052 self.ncut = nlevels - 1 

3053 self.link = _MultinomialLogit(self.ncut) 

3054 

3055 

3056class GEEMargins(object): 

3057 """ 

3058 Estimated marginal effects for a regression model fit with GEE. 

3059 

3060 Parameters 

3061 ---------- 

3062 results : GEEResults instance 

3063 The results instance of a fitted discrete choice model 

3064 args : tuple 

3065 Args are passed to `get_margeff`. This is the same as 

3066 results.get_margeff. See there for more information. 

3067 kwargs : dict 

3068 Keyword args are passed to `get_margeff`. This is the same as 

3069 results.get_margeff. See there for more information. 

3070 """ 

3071 

3072 def __init__(self, results, args, kwargs={}): 

3073 self._cache = {} 

3074 self.results = results 

3075 self.get_margeff(*args, **kwargs) 

3076 

3077 def _reset(self): 

3078 self._cache = {} 

3079 

3080 @cache_readonly 

3081 def tvalues(self): 

3082 _check_at_is_all(self.margeff_options) 

3083 return self.margeff / self.margeff_se 

3084 

3085 def summary_frame(self, alpha=.05): 

3086 """ 

3087 Returns a DataFrame summarizing the marginal effects. 

3088 

3089 Parameters 

3090 ---------- 

3091 alpha : float 

3092 Number between 0 and 1. The confidence intervals have the 

3093 probability 1-alpha. 

3094 

3095 Returns 

3096 ------- 

3097 frame : DataFrames 

3098 A DataFrame summarizing the marginal effects. 

3099 """ 

3100 _check_at_is_all(self.margeff_options) 

3101 from pandas import DataFrame 

3102 names = [_transform_names[self.margeff_options['method']], 

3103 'Std. Err.', 'z', 'Pr(>|z|)', 

3104 'Conf. Int. Low', 'Cont. Int. Hi.'] 

3105 ind = self.results.model.exog.var(0) != 0 # True if not a constant 

3106 exog_names = self.results.model.exog_names 

3107 var_names = [name for i, name in enumerate(exog_names) if ind[i]] 

3108 table = np.column_stack((self.margeff, self.margeff_se, self.tvalues, 

3109 self.pvalues, self.conf_int(alpha))) 

3110 return DataFrame(table, columns=names, index=var_names) 

3111 

3112 @cache_readonly 

3113 def pvalues(self): 

3114 _check_at_is_all(self.margeff_options) 

3115 return stats.norm.sf(np.abs(self.tvalues)) * 2 

3116 

3117 def conf_int(self, alpha=.05): 

3118 """ 

3119 Returns the confidence intervals of the marginal effects 

3120 

3121 Parameters 

3122 ---------- 

3123 alpha : float 

3124 Number between 0 and 1. The confidence intervals have the 

3125 probability 1-alpha. 

3126 

3127 Returns 

3128 ------- 

3129 conf_int : ndarray 

3130 An array with lower, upper confidence intervals for the marginal 

3131 effects. 

3132 """ 

3133 _check_at_is_all(self.margeff_options) 

3134 me_se = self.margeff_se 

3135 q = stats.norm.ppf(1 - alpha / 2) 

3136 lower = self.margeff - q * me_se 

3137 upper = self.margeff + q * me_se 

3138 return np.asarray(lzip(lower, upper)) 

3139 

3140 def summary(self, alpha=.05): 

3141 """ 

3142 Returns a summary table for marginal effects 

3143 

3144 Parameters 

3145 ---------- 

3146 alpha : float 

3147 Number between 0 and 1. The confidence intervals have the 

3148 probability 1-alpha. 

3149 

3150 Returns 

3151 ------- 

3152 Summary : SummaryTable 

3153 A SummaryTable instance 

3154 """ 

3155 _check_at_is_all(self.margeff_options) 

3156 results = self.results 

3157 model = results.model 

3158 title = model.__class__.__name__ + " Marginal Effects" 

3159 method = self.margeff_options['method'] 

3160 top_left = [('Dep. Variable:', [model.endog_names]), 

3161 ('Method:', [method]), 

3162 ('At:', [self.margeff_options['at']]), ] 

3163 

3164 from statsmodels.iolib.summary import (Summary, summary_params, 

3165 table_extend) 

3166 exog_names = model.exog_names[:] # copy 

3167 smry = Summary() 

3168 

3169 const_idx = model.data.const_idx 

3170 if const_idx is not None: 

3171 exog_names.pop(const_idx) 

3172 

3173 J = int(getattr(model, "J", 1)) 

3174 if J > 1: 

3175 yname, yname_list = results._get_endog_name(model.endog_names, 

3176 None, all=True) 

3177 else: 

3178 yname = model.endog_names 

3179 yname_list = [yname] 

3180 

3181 smry.add_table_2cols(self, gleft=top_left, gright=[], 

3182 yname=yname, xname=exog_names, title=title) 

3183 

3184 # NOTE: add_table_params is not general enough yet for margeff 

3185 # could use a refactor with getattr instead of hard-coded params 

3186 # tvalues etc. 

3187 table = [] 

3188 conf_int = self.conf_int(alpha) 

3189 margeff = self.margeff 

3190 margeff_se = self.margeff_se 

3191 tvalues = self.tvalues 

3192 pvalues = self.pvalues 

3193 if J > 1: 

3194 for eq in range(J): 

3195 restup = (results, margeff[:, eq], margeff_se[:, eq], 

3196 tvalues[:, eq], pvalues[:, eq], conf_int[:, :, eq]) 

3197 tble = summary_params(restup, yname=yname_list[eq], 

3198 xname=exog_names, alpha=alpha, 

3199 use_t=False, 

3200 skip_header=True) 

3201 tble.title = yname_list[eq] 

3202 # overwrite coef with method name 

3203 header = ['', _transform_names[method], 'std err', 'z', 

3204 'P>|z|', 

3205 '[%3.1f%% Conf. Int.]' % (100 - alpha * 100)] 

3206 tble.insert_header_row(0, header) 

3207 # from IPython.core.debugger import Pdb; Pdb().set_trace() 

3208 table.append(tble) 

3209 

3210 table = table_extend(table, keep_headers=True) 

3211 else: 

3212 restup = (results, margeff, margeff_se, tvalues, pvalues, conf_int) 

3213 table = summary_params(restup, yname=yname, xname=exog_names, 

3214 alpha=alpha, use_t=False, skip_header=True) 

3215 header = ['', _transform_names[method], 'std err', 'z', 

3216 'P>|z|', '[%3.1f%% Conf. Int.]' % (100 - alpha * 100)] 

3217 table.insert_header_row(0, header) 

3218 

3219 smry.tables.append(table) 

3220 return smry 

3221 

3222 def get_margeff(self, at='overall', method='dydx', atexog=None, 

3223 dummy=False, count=False): 

3224 

3225 self._reset() # always reset the cache when this is called 

3226 # TODO: if at is not all or overall, we can also put atexog values 

3227 # in summary table head 

3228 method = method.lower() 

3229 at = at.lower() 

3230 _check_margeff_args(at, method) 

3231 self.margeff_options = dict(method=method, at=at) 

3232 results = self.results 

3233 model = results.model 

3234 params = results.params 

3235 exog = model.exog.copy() # copy because values are changed 

3236 effects_idx = exog.var(0) != 0 

3237 const_idx = model.data.const_idx 

3238 

3239 if dummy: 

3240 _check_discrete_args(at, method) 

3241 dummy_idx, dummy = _get_dummy_index(exog, const_idx) 

3242 else: 

3243 dummy_idx = None 

3244 

3245 if count: 

3246 _check_discrete_args(at, method) 

3247 count_idx, count = _get_count_index(exog, const_idx) 

3248 else: 

3249 count_idx = None 

3250 

3251 # get the exogenous variables 

3252 exog = _get_margeff_exog(exog, at, atexog, effects_idx) 

3253 

3254 # get base marginal effects, handled by sub-classes 

3255 effects = model._derivative_exog(params, exog, method, 

3256 dummy_idx, count_idx) 

3257 effects = _effects_at(effects, at) 

3258 

3259 if at == 'all': 

3260 self.margeff = effects[:, effects_idx] 

3261 else: 

3262 # Set standard error of the marginal effects by Delta method. 

3263 margeff_cov, margeff_se = margeff_cov_with_se( 

3264 model, params, exog, results.cov_params(), at, 

3265 model._derivative_exog, dummy_idx, count_idx, 

3266 method, 1) 

3267 

3268 # do not care about at constant 

3269 self.margeff_cov = margeff_cov[effects_idx][:, effects_idx] 

3270 self.margeff_se = margeff_se[effects_idx] 

3271 self.margeff = effects[effects_idx]