Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# This file is part of Patsy 

2# Copyright (C) 2011-2012 Nathaniel Smith <njs@pobox.com> 

3# See file LICENSE.txt for license information. 

4 

5# http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm 

6# http://www.ats.ucla.edu/stat/sas/webbooks/reg/chapter5/sasreg5.htm 

7 

8from __future__ import print_function 

9 

10# These are made available in the patsy.* namespace 

11__all__ = ["ContrastMatrix", "Treatment", "Poly", "Sum", "Helmert", "Diff"] 

12 

13import sys 

14import six 

15import numpy as np 

16from patsy import PatsyError 

17from patsy.util import (repr_pretty_delegate, repr_pretty_impl, 

18 safe_issubdtype, 

19 no_pickling, assert_no_pickling) 

20 

21class ContrastMatrix(object): 

22 """A simple container for a matrix used for coding categorical factors. 

23 

24 Attributes: 

25 

26 .. attribute:: matrix 

27 

28 A 2d ndarray, where each column corresponds to one column of the 

29 resulting design matrix, and each row contains the entries for a single 

30 categorical variable level. Usually n-by-n for a full rank coding or 

31 n-by-(n-1) for a reduced rank coding, though other options are 

32 possible. 

33 

34 .. attribute:: column_suffixes 

35 

36 A list of strings to be appended to the factor name, to produce the 

37 final column names. E.g. for treatment coding the entries will look 

38 like ``"[T.level1]"``. 

39 """ 

40 def __init__(self, matrix, column_suffixes): 

41 self.matrix = np.asarray(matrix) 

42 self.column_suffixes = column_suffixes 

43 if self.matrix.shape[1] != len(column_suffixes): 

44 raise PatsyError("matrix and column_suffixes don't conform") 

45 

46 __repr__ = repr_pretty_delegate 

47 def _repr_pretty_(self, p, cycle): 

48 repr_pretty_impl(p, self, [self.matrix, self.column_suffixes]) 

49 

50 __getstate__ = no_pickling 

51 

52def test_ContrastMatrix(): 

53 cm = ContrastMatrix([[1, 0], [0, 1]], ["a", "b"]) 

54 assert np.array_equal(cm.matrix, np.eye(2)) 

55 assert cm.column_suffixes == ["a", "b"] 

56 # smoke test 

57 repr(cm) 

58 

59 from nose.tools import assert_raises 

60 assert_raises(PatsyError, ContrastMatrix, [[1], [0]], ["a", "b"]) 

61 

62 assert_no_pickling(cm) 

63 

64# This always produces an object of the type that Python calls 'str' (whether 

65# that be a Python 2 string-of-bytes or a Python 3 string-of-unicode). It does 

66# *not* make any particular guarantees about being reversible or having other 

67# such useful programmatic properties -- it just produces something that will 

68# be nice for users to look at. 

69def _obj_to_readable_str(obj): 

70 if isinstance(obj, str): 

71 return obj 

72 elif sys.version_info >= (3,) and isinstance(obj, bytes): 

73 try: 

74 return obj.decode("utf-8") 

75 except UnicodeDecodeError: 

76 return repr(obj) 

77 elif sys.version_info < (3,) and isinstance(obj, unicode): 

78 try: 

79 return obj.encode("ascii") 

80 except UnicodeEncodeError: 

81 return repr(obj) 

82 else: 

83 return repr(obj) 

84 

85def test__obj_to_readable_str(): 

86 def t(obj, expected): 

87 got = _obj_to_readable_str(obj) 

88 assert type(got) is str 

89 assert got == expected 

90 t(1, "1") 

91 t(1.0, "1.0") 

92 t("asdf", "asdf") 

93 t(six.u("asdf"), "asdf") 

94 if sys.version_info >= (3,): 

95 # we can use "foo".encode here b/c this is python 3! 

96 # a utf-8 encoded euro-sign comes out as a real euro sign. 

97 t("\u20ac".encode("utf-8"), six.u("\u20ac")) 

98 # but a iso-8859-15 euro sign can't be decoded, and we fall back on 

99 # repr() 

100 t("\u20ac".encode("iso-8859-15"), "b'\\xa4'") 

101 else: 

102 t(six.u("\u20ac"), "u'\\u20ac'") 

103 

104def _name_levels(prefix, levels): 

105 return ["[%s%s]" % (prefix, _obj_to_readable_str(level)) for level in levels] 

106 

107def test__name_levels(): 

108 assert _name_levels("a", ["b", "c"]) == ["[ab]", "[ac]"] 

109 

110def _dummy_code(levels): 

111 return ContrastMatrix(np.eye(len(levels)), _name_levels("", levels)) 

112 

113def _get_level(levels, level_ref): 

114 if level_ref in levels: 

115 return levels.index(level_ref) 

116 if isinstance(level_ref, six.integer_types): 

117 if level_ref < 0: 

118 level_ref += len(levels) 

119 if not (0 <= level_ref < len(levels)): 

120 raise PatsyError("specified level %r is out of range" 

121 % (level_ref,)) 

122 return level_ref 

123 raise PatsyError("specified level %r not found" % (level_ref,)) 

124 

125def test__get_level(): 

126 assert _get_level(["a", "b", "c"], 0) == 0 

127 assert _get_level(["a", "b", "c"], -1) == 2 

128 assert _get_level(["a", "b", "c"], "b") == 1 

129 # For integer levels, we check identity before treating it as an index 

130 assert _get_level([2, 1, 0], 0) == 2 

131 from nose.tools import assert_raises 

132 assert_raises(PatsyError, _get_level, ["a", "b"], 2) 

133 assert_raises(PatsyError, _get_level, ["a", "b"], -3) 

134 assert_raises(PatsyError, _get_level, ["a", "b"], "c") 

135 

136 if not six.PY3: 

137 assert _get_level(["a", "b", "c"], long(0)) == 0 

138 assert _get_level(["a", "b", "c"], long(-1)) == 2 

139 assert _get_level([2, 1, 0], long(0)) == 2 

140 

141 

142class Treatment(object): 

143 """Treatment coding (also known as dummy coding). 

144 

145 This is the default coding. 

146 

147 For reduced-rank coding, one level is chosen as the "reference", and its 

148 mean behaviour is represented by the intercept. Each column of the 

149 resulting matrix represents the difference between the mean of one level 

150 and this reference level. 

151 

152 For full-rank coding, classic "dummy" coding is used, and each column of 

153 the resulting matrix represents the mean of the corresponding level. 

154 

155 The reference level defaults to the first level, or can be specified 

156 explicitly. 

157 

158 .. ipython:: python 

159 

160 # reduced rank 

161 dmatrix("C(a, Treatment)", balanced(a=3)) 

162 # full rank 

163 dmatrix("0 + C(a, Treatment)", balanced(a=3)) 

164 # Setting a reference level 

165 dmatrix("C(a, Treatment(1))", balanced(a=3)) 

166 dmatrix("C(a, Treatment('a2'))", balanced(a=3)) 

167 

168 Equivalent to R ``contr.treatment``. The R documentation suggests that 

169 using ``Treatment(reference=-1)`` will produce contrasts that are 

170 "equivalent to those produced by many (but not all) SAS procedures". 

171 """ 

172 def __init__(self, reference=None): 

173 self.reference = reference 

174 

175 def code_with_intercept(self, levels): 

176 return _dummy_code(levels) 

177 

178 def code_without_intercept(self, levels): 

179 if self.reference is None: 

180 reference = 0 

181 else: 

182 reference = _get_level(levels, self.reference) 

183 eye = np.eye(len(levels) - 1) 

184 contrasts = np.vstack((eye[:reference, :], 

185 np.zeros((1, len(levels) - 1)), 

186 eye[reference:, :])) 

187 names = _name_levels("T.", levels[:reference] + levels[reference + 1:]) 

188 return ContrastMatrix(contrasts, names) 

189 

190 __getstate__ = no_pickling 

191 

192def test_Treatment(): 

193 t1 = Treatment() 

194 matrix = t1.code_with_intercept(["a", "b", "c"]) 

195 assert matrix.column_suffixes == ["[a]", "[b]", "[c]"] 

196 assert np.allclose(matrix.matrix, [[1, 0, 0], [0, 1, 0], [0, 0, 1]]) 

197 matrix = t1.code_without_intercept(["a", "b", "c"]) 

198 assert matrix.column_suffixes == ["[T.b]", "[T.c]"] 

199 assert np.allclose(matrix.matrix, [[0, 0], [1, 0], [0, 1]]) 

200 matrix = Treatment(reference=1).code_without_intercept(["a", "b", "c"]) 

201 assert matrix.column_suffixes == ["[T.a]", "[T.c]"] 

202 assert np.allclose(matrix.matrix, [[1, 0], [0, 0], [0, 1]]) 

203 matrix = Treatment(reference=-2).code_without_intercept(["a", "b", "c"]) 

204 assert matrix.column_suffixes == ["[T.a]", "[T.c]"] 

205 assert np.allclose(matrix.matrix, [[1, 0], [0, 0], [0, 1]]) 

206 matrix = Treatment(reference="b").code_without_intercept(["a", "b", "c"]) 

207 assert matrix.column_suffixes == ["[T.a]", "[T.c]"] 

208 assert np.allclose(matrix.matrix, [[1, 0], [0, 0], [0, 1]]) 

209 # Make sure the default is always the first level, even if there is a 

210 # different level called 0. 

211 matrix = Treatment().code_without_intercept([2, 1, 0]) 

212 assert matrix.column_suffixes == ["[T.1]", "[T.0]"] 

213 assert np.allclose(matrix.matrix, [[0, 0], [1, 0], [0, 1]]) 

214 

215class Poly(object): 

216 """Orthogonal polynomial contrast coding. 

217 

218 This coding scheme treats the levels as ordered samples from an underlying 

219 continuous scale, whose effect takes an unknown functional form which is 

220 `Taylor-decomposed`__ into the sum of a linear, quadratic, etc. components. 

221 

222 .. __: https://en.wikipedia.org/wiki/Taylor_series 

223 

224 For reduced-rank coding, you get a linear column, a quadratic column, 

225 etc., up to the number of levels provided. 

226 

227 For full-rank coding, the same scheme is used, except that the zero-order 

228 constant polynomial is also included. I.e., you get an intercept column 

229 included as part of your categorical term. 

230 

231 By default the levels are treated as equally spaced, but you can override 

232 this by providing a value for the `scores` argument. 

233 

234 Examples: 

235 

236 .. ipython:: python 

237 

238 # Reduced rank 

239 dmatrix("C(a, Poly)", balanced(a=4)) 

240 # Full rank 

241 dmatrix("0 + C(a, Poly)", balanced(a=3)) 

242 # Explicit scores 

243 dmatrix("C(a, Poly([1, 2, 10]))", balanced(a=3)) 

244 

245 This is equivalent to R's ``contr.poly``. (But note that in R, reduced 

246 rank encodings are always dummy-coded, regardless of what contrast you 

247 have set.) 

248 """ 

249 def __init__(self, scores=None): 

250 self.scores = scores 

251 

252 def _code_either(self, intercept, levels): 

253 n = len(levels) 

254 scores = self.scores 

255 if scores is None: 

256 scores = np.arange(n) 

257 scores = np.asarray(scores, dtype=float) 

258 if len(scores) != n: 

259 raise PatsyError("number of levels (%s) does not match" 

260 " number of scores (%s)" 

261 % (n, len(scores))) 

262 # Strategy: just make a matrix whose columns are naive linear, 

263 # quadratic, etc., functions of the raw scores, and then use 'qr' to 

264 # orthogonalize each column against those to its left. 

265 scores -= scores.mean() 

266 raw_poly = scores.reshape((-1, 1)) ** np.arange(n).reshape((1, -1)) 

267 q, r = np.linalg.qr(raw_poly) 

268 q *= np.sign(np.diag(r)) 

269 q /= np.sqrt(np.sum(q ** 2, axis=1)) 

270 # The constant term is always all 1's -- we don't normalize it. 

271 q[:, 0] = 1 

272 names = [".Constant", ".Linear", ".Quadratic", ".Cubic"] 

273 names += ["^%s" % (i,) for i in range(4, n)] 

274 names = names[:n] 

275 if intercept: 

276 return ContrastMatrix(q, names) 

277 else: 

278 # We always include the constant/intercept column as something to 

279 # orthogonalize against, but we don't always return it: 

280 return ContrastMatrix(q[:, 1:], names[1:]) 

281 

282 def code_with_intercept(self, levels): 

283 return self._code_either(True, levels) 

284 

285 def code_without_intercept(self, levels): 

286 return self._code_either(False, levels) 

287 

288 __getstate__ = no_pickling 

289 

290def test_Poly(): 

291 t1 = Poly() 

292 matrix = t1.code_with_intercept(["a", "b", "c"]) 

293 assert matrix.column_suffixes == [".Constant", ".Linear", ".Quadratic"] 

294 # Values from R 'options(digits=15); contr.poly(3)' 

295 expected = [[1, -7.07106781186548e-01, 0.408248290463863], 

296 [1, 0, -0.816496580927726], 

297 [1, 7.07106781186547e-01, 0.408248290463863]] 

298 print(matrix.matrix) 

299 assert np.allclose(matrix.matrix, expected) 

300 matrix = t1.code_without_intercept(["a", "b", "c"]) 

301 assert matrix.column_suffixes == [".Linear", ".Quadratic"] 

302 # Values from R 'options(digits=15); contr.poly(3)' 

303 print(matrix.matrix) 

304 assert np.allclose(matrix.matrix, 

305 [[-7.07106781186548e-01, 0.408248290463863], 

306 [0, -0.816496580927726], 

307 [7.07106781186547e-01, 0.408248290463863]]) 

308 

309 matrix = Poly(scores=[0, 10, 11]).code_with_intercept(["a", "b", "c"]) 

310 assert matrix.column_suffixes == [".Constant", ".Linear", ".Quadratic"] 

311 # Values from R 'options(digits=15); contr.poly(3, scores=c(0, 10, 11))' 

312 print(matrix.matrix) 

313 assert np.allclose(matrix.matrix, 

314 [[1, -0.813733471206735, 0.0671156055214024], 

315 [1, 0.348742916231458, -0.7382716607354268], 

316 [1, 0.464990554975277, 0.6711560552140243]]) 

317 

318 # we had an integer/float handling bug for score vectors whose mean was 

319 # non-integer, so check one of those: 

320 matrix = Poly(scores=[0, 10, 12]).code_with_intercept(["a", "b", "c"]) 

321 assert matrix.column_suffixes == [".Constant", ".Linear", ".Quadratic"] 

322 # Values from R 'options(digits=15); contr.poly(3, scores=c(0, 10, 12))' 

323 print(matrix.matrix) 

324 assert np.allclose(matrix.matrix, 

325 [[1, -0.806559132617443, 0.127000127000191], 

326 [1, 0.293294230042706, -0.762000762001143], 

327 [1, 0.513264902574736, 0.635000635000952]]) 

328 

329 from nose.tools import assert_raises 

330 assert_raises(PatsyError, 

331 Poly(scores=[0, 1]).code_with_intercept, 

332 ["a", "b", "c"]) 

333 

334 matrix = t1.code_with_intercept(list(range(6))) 

335 assert matrix.column_suffixes == [".Constant", ".Linear", ".Quadratic", 

336 ".Cubic", "^4", "^5"] 

337 

338 

339class Sum(object): 

340 """Deviation coding (also known as sum-to-zero coding). 

341 

342 Compares the mean of each level to the mean-of-means. (In a balanced 

343 design, compares the mean of each level to the overall mean.) 

344 

345 For full-rank coding, a standard intercept term is added. 

346 

347 One level must be omitted to avoid redundancy; by default this is the last 

348 level, but this can be adjusted via the `omit` argument. 

349 

350 .. warning:: There are multiple definitions of 'deviation coding' in 

351 use. Make sure this is the one you expect before trying to interpret 

352 your results! 

353 

354 Examples: 

355 

356 .. ipython:: python 

357 

358 # Reduced rank 

359 dmatrix("C(a, Sum)", balanced(a=4)) 

360 # Full rank 

361 dmatrix("0 + C(a, Sum)", balanced(a=4)) 

362 # Omit a different level 

363 dmatrix("C(a, Sum(1))", balanced(a=3)) 

364 dmatrix("C(a, Sum('a1'))", balanced(a=3)) 

365 

366 This is equivalent to R's `contr.sum`. 

367 """ 

368 def __init__(self, omit=None): 

369 self.omit = omit 

370 

371 def _omit_i(self, levels): 

372 if self.omit is None: 

373 # We assume below that this is positive 

374 return len(levels) - 1 

375 else: 

376 return _get_level(levels, self.omit) 

377 

378 def _sum_contrast(self, levels): 

379 n = len(levels) 

380 omit_i = self._omit_i(levels) 

381 eye = np.eye(n - 1) 

382 out = np.empty((n, n - 1)) 

383 out[:omit_i, :] = eye[:omit_i, :] 

384 out[omit_i, :] = -1 

385 out[omit_i + 1:, :] = eye[omit_i:, :] 

386 return out 

387 

388 def code_with_intercept(self, levels): 

389 contrast = self.code_without_intercept(levels) 

390 matrix = np.column_stack((np.ones(len(levels)), 

391 contrast.matrix)) 

392 column_suffixes = ["[mean]"] + contrast.column_suffixes 

393 return ContrastMatrix(matrix, column_suffixes) 

394 

395 def code_without_intercept(self, levels): 

396 matrix = self._sum_contrast(levels) 

397 omit_i = self._omit_i(levels) 

398 included_levels = levels[:omit_i] + levels[omit_i + 1:] 

399 return ContrastMatrix(matrix, _name_levels("S.", included_levels)) 

400 

401 __getstate__ = no_pickling 

402 

403def test_Sum(): 

404 t1 = Sum() 

405 matrix = t1.code_with_intercept(["a", "b", "c"]) 

406 assert matrix.column_suffixes == ["[mean]", "[S.a]", "[S.b]"] 

407 assert np.allclose(matrix.matrix, [[1, 1, 0], [1, 0, 1], [1, -1, -1]]) 

408 matrix = t1.code_without_intercept(["a", "b", "c"]) 

409 assert matrix.column_suffixes == ["[S.a]", "[S.b]"] 

410 assert np.allclose(matrix.matrix, [[1, 0], [0, 1], [-1, -1]]) 

411 # Check that it's not thrown off by negative integer term names 

412 matrix = t1.code_without_intercept([-1, -2, -3]) 

413 assert matrix.column_suffixes == ["[S.-1]", "[S.-2]"] 

414 assert np.allclose(matrix.matrix, [[1, 0], [0, 1], [-1, -1]]) 

415 t2 = Sum(omit=1) 

416 matrix = t2.code_with_intercept(["a", "b", "c"]) 

417 assert matrix.column_suffixes == ["[mean]", "[S.a]", "[S.c]"] 

418 assert np.allclose(matrix.matrix, [[1, 1, 0], [1, -1, -1], [1, 0, 1]]) 

419 matrix = t2.code_without_intercept(["a", "b", "c"]) 

420 assert matrix.column_suffixes == ["[S.a]", "[S.c]"] 

421 assert np.allclose(matrix.matrix, [[1, 0], [-1, -1], [0, 1]]) 

422 matrix = t2.code_without_intercept([1, 0, 2]) 

423 assert matrix.column_suffixes == ["[S.0]", "[S.2]"] 

424 assert np.allclose(matrix.matrix, [[-1, -1], [1, 0], [0, 1]]) 

425 t3 = Sum(omit=-3) 

426 matrix = t3.code_with_intercept(["a", "b", "c"]) 

427 assert matrix.column_suffixes == ["[mean]", "[S.b]", "[S.c]"] 

428 assert np.allclose(matrix.matrix, [[1, -1, -1], [1, 1, 0], [1, 0, 1]]) 

429 matrix = t3.code_without_intercept(["a", "b", "c"]) 

430 assert matrix.column_suffixes == ["[S.b]", "[S.c]"] 

431 assert np.allclose(matrix.matrix, [[-1, -1], [1, 0], [0, 1]]) 

432 t4 = Sum(omit="a") 

433 matrix = t3.code_with_intercept(["a", "b", "c"]) 

434 assert matrix.column_suffixes == ["[mean]", "[S.b]", "[S.c]"] 

435 assert np.allclose(matrix.matrix, [[1, -1, -1], [1, 1, 0], [1, 0, 1]]) 

436 matrix = t3.code_without_intercept(["a", "b", "c"]) 

437 assert matrix.column_suffixes == ["[S.b]", "[S.c]"] 

438 assert np.allclose(matrix.matrix, [[-1, -1], [1, 0], [0, 1]]) 

439 

440class Helmert(object): 

441 """Helmert contrasts. 

442 

443 Compares the second level with the first, the third with the average of 

444 the first two, and so on. 

445 

446 For full-rank coding, a standard intercept term is added. 

447 

448 .. warning:: There are multiple definitions of 'Helmert coding' in 

449 use. Make sure this is the one you expect before trying to interpret 

450 your results! 

451 

452 Examples: 

453 

454 .. ipython:: python 

455 

456 # Reduced rank 

457 dmatrix("C(a, Helmert)", balanced(a=4)) 

458 # Full rank 

459 dmatrix("0 + C(a, Helmert)", balanced(a=4)) 

460 

461 This is equivalent to R's `contr.helmert`. 

462 """ 

463 def _helmert_contrast(self, levels): 

464 n = len(levels) 

465 #http://www.ats.ucla.edu/stat/sas/webbooks/reg/chapter5/sasreg5.htm#HELMERT 

466 #contr = np.eye(n - 1) 

467 #int_range = np.arange(n - 1., 1, -1) 

468 #denom = np.repeat(int_range, np.arange(n - 2, 0, -1)) 

469 #contr[np.tril_indices(n - 1, -1)] = -1. / denom 

470 

471 #http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm#HELMERT 

472 #contr = np.zeros((n - 1., n - 1)) 

473 #int_range = np.arange(n, 1, -1) 

474 #denom = np.repeat(int_range[:-1], np.arange(n - 2, 0, -1)) 

475 #contr[np.diag_indices(n - 1)] = (int_range - 1.) / int_range 

476 #contr[np.tril_indices(n - 1, -1)] = -1. / denom 

477 #contr = np.vstack((contr, -1./int_range)) 

478 

479 #r-like 

480 contr = np.zeros((n, n - 1)) 

481 contr[1:][np.diag_indices(n - 1)] = np.arange(1, n) 

482 contr[np.triu_indices(n - 1)] = -1 

483 return contr 

484 

485 def code_with_intercept(self, levels): 

486 contrast = np.column_stack((np.ones(len(levels)), 

487 self._helmert_contrast(levels))) 

488 column_suffixes = _name_levels("H.", ["intercept"] + list(levels[1:])) 

489 return ContrastMatrix(contrast, column_suffixes) 

490 

491 def code_without_intercept(self, levels): 

492 contrast = self._helmert_contrast(levels) 

493 return ContrastMatrix(contrast, 

494 _name_levels("H.", levels[1:])) 

495 

496 __getstate__ = no_pickling 

497 

498def test_Helmert(): 

499 t1 = Helmert() 

500 for levels in (["a", "b", "c", "d"], ("a", "b", "c", "d")): 

501 matrix = t1.code_with_intercept(levels) 

502 assert matrix.column_suffixes == ["[H.intercept]", 

503 "[H.b]", 

504 "[H.c]", 

505 "[H.d]"] 

506 assert np.allclose(matrix.matrix, [[1, -1, -1, -1], 

507 [1, 1, -1, -1], 

508 [1, 0, 2, -1], 

509 [1, 0, 0, 3]]) 

510 matrix = t1.code_without_intercept(levels) 

511 assert matrix.column_suffixes == ["[H.b]", "[H.c]", "[H.d]"] 

512 assert np.allclose(matrix.matrix, [[-1, -1, -1], 

513 [1, -1, -1], 

514 [0, 2, -1], 

515 [0, 0, 3]]) 

516 

517class Diff(object): 

518 """Backward difference coding. 

519 

520 This coding scheme is useful for ordered factors, and compares the mean of 

521 each level with the preceding level. So you get the second level minus the 

522 first, the third level minus the second, etc. 

523 

524 For full-rank coding, a standard intercept term is added (which gives the 

525 mean value for the first level). 

526 

527 Examples: 

528 

529 .. ipython:: python 

530 

531 # Reduced rank 

532 dmatrix("C(a, Diff)", balanced(a=3)) 

533 # Full rank 

534 dmatrix("0 + C(a, Diff)", balanced(a=3)) 

535 """ 

536 def _diff_contrast(self, levels): 

537 nlevels = len(levels) 

538 contr = np.zeros((nlevels, nlevels-1)) 

539 int_range = np.arange(1, nlevels) 

540 upper_int = np.repeat(int_range, int_range) 

541 row_i, col_i = np.triu_indices(nlevels-1) 

542 # we want to iterate down the columns not across the rows 

543 # it would be nice if the index functions had a row/col order arg 

544 col_order = np.argsort(col_i) 

545 contr[row_i[col_order], 

546 col_i[col_order]] = (upper_int-nlevels)/float(nlevels) 

547 lower_int = np.repeat(int_range, int_range[::-1]) 

548 row_i, col_i = np.tril_indices(nlevels-1) 

549 # we want to iterate down the columns not across the rows 

550 col_order = np.argsort(col_i) 

551 contr[row_i[col_order]+1, col_i[col_order]] = lower_int/float(nlevels) 

552 return contr 

553 

554 def code_with_intercept(self, levels): 

555 contrast = np.column_stack((np.ones(len(levels)), 

556 self._diff_contrast(levels))) 

557 return ContrastMatrix(contrast, _name_levels("D.", levels)) 

558 

559 def code_without_intercept(self, levels): 

560 contrast = self._diff_contrast(levels) 

561 return ContrastMatrix(contrast, _name_levels("D.", levels[:-1])) 

562 

563 __getstate__ = no_pickling 

564 

565def test_diff(): 

566 t1 = Diff() 

567 matrix = t1.code_with_intercept(["a", "b", "c", "d"]) 

568 assert matrix.column_suffixes == ["[D.a]", "[D.b]", "[D.c]", 

569 "[D.d]"] 

570 assert np.allclose(matrix.matrix, [[1, -3/4., -1/2., -1/4.], 

571 [1, 1/4., -1/2., -1/4.], 

572 [1, 1/4., 1./2, -1/4.], 

573 [1, 1/4., 1/2., 3/4.]]) 

574 matrix = t1.code_without_intercept(["a", "b", "c", "d"]) 

575 assert matrix.column_suffixes == ["[D.a]", "[D.b]", "[D.c]"] 

576 assert np.allclose(matrix.matrix, [[-3/4., -1/2., -1/4.], 

577 [1/4., -1/2., -1/4.], 

578 [1/4., 2./4, -1/4.], 

579 [1/4., 1/2., 3/4.]]) 

580 

581# contrast can be: 

582# -- a ContrastMatrix 

583# -- a simple np.ndarray 

584# -- an object with code_with_intercept and code_without_intercept methods 

585# -- a function returning one of the above 

586# -- None, in which case the above rules are applied to 'default' 

587# This function always returns a ContrastMatrix. 

588def code_contrast_matrix(intercept, levels, contrast, default=None): 

589 if contrast is None: 

590 contrast = default 

591 if callable(contrast): 

592 contrast = contrast() 

593 if isinstance(contrast, ContrastMatrix): 

594 return contrast 

595 as_array = np.asarray(contrast) 

596 if safe_issubdtype(as_array.dtype, np.number): 

597 return ContrastMatrix(as_array, 

598 _name_levels("custom", range(as_array.shape[1]))) 

599 if intercept: 

600 return contrast.code_with_intercept(levels) 

601 else: 

602 return contrast.code_without_intercept(levels) 

603