Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

2Implements Lilliefors corrected Kolmogorov-Smirnov tests for normal and 

3exponential distributions. 

4 

5`kstest_fit` is provided as a top-level function to access both tests. 

6`kstest_normal` and `kstest_exponential` are provided as convenience functions 

7with the appropriate test as the default. 

8`lilliefors` is provided as an alias for `kstest_fit`. 

9 

10Created on Sat Oct 01 13:16:49 2011 

11 

12Author: Josef Perktold 

13License: BSD-3 

14 

15pvalues for Lilliefors test are based on formula and table in 

16 

17An Analytic Approximation to the Distribution of Lilliefors's Test Statistic 

18for Normality 

19Author(s): Gerard E. Dallal and Leland Wilkinson. Source: The American 

20Statistician, Vol. 40, No. 4 (Nov., 1986), pp. 294-296 

21Published by: American Statistical Association 

22Stable URL: http://www.jstor.org/stable/2684607 . 

23 

24On the Kolmogorov-Smirnov Test for Normality with Mean and Variance Unknown 

25Hubert W. Lilliefors 

26Journal of the American Statistical Association, Vol. 62, No. 318. 

27(Jun., 1967), pp. 399-402. 

28 

29--- 

30 

31Updated 2017-07-23 

32Jacob C. Kimmel 

33 

34Ref: 

35Lilliefors, H.W. 

36On the Kolmogorov-Smirnov test for the exponential distribution with mean 

37unknown. Journal of the American Statistical Association, Vol 64, No. 325. 

38(1969), pp. 387–389. 

39""" 

40from functools import partial 

41 

42import numpy as np 

43from scipy import stats 

44 

45from ._lilliefors_critical_values import (critical_values, 

46 asymp_critical_values, 

47 PERCENTILES) 

48from .tabledist import TableDist 

49 

50 

51def _make_asymptotic_function(params): 

52 """ 

53 Generates an asymptotic distribution callable from a param matrix 

54 

55 Polynomial is a[0] * x**(-1/2) + a[1] * x**(-1) + a[2] * x**(-3/2) 

56 

57 Parameters 

58 ---------- 

59 params : ndarray 

60 Array with shape (nalpha, 3) where nalpha is the number of 

61 significance levels 

62 """ 

63 

64 def f(n): 

65 poly = np.array([1, np.log(n), np.log(n) ** 2]) 

66 return np.exp(poly.dot(params.T)) 

67 

68 return f 

69 

70 

def ksstat(x, cdf, alternative='two_sided', args=()):
    """
    Calculate statistic for the Kolmogorov-Smirnov test for goodness of fit

    Computes the test statistic comparing the empirical distribution G(x) of
    the observations against a hypothesized distribution F(x). Under the null
    hypothesis the two distributions coincide, G(x)=F(x). The alternative
    may be 'two_sided' (default), 'less' or 'greater'. The KS test is only
    valid for continuous distributions.

    Parameters
    ----------
    x : array_like, 1d
        array of observations
    cdf : str or callable
        string: name of a distribution in scipy.stats
        callable: function to evaluate cdf
    alternative : 'two_sided' (default), 'less' or 'greater'
        defines the alternative hypothesis (see explanation)
    args : tuple, sequence
        distribution parameters for call to cdf

    Returns
    -------
    D : float
        KS test statistic, either D, D+ or D-

    See Also
    --------
    scipy.stats.kstest

    Notes
    -----
    In the one-sided test, the alternative is that the empirical
    cumulative distribution function of the random variable is "less"
    or "greater" than the cumulative distribution function F(x) of the
    hypothesis, G(x)<=F(x), resp. G(x)>=F(x).

    In contrast to scipy.stats.kstest, this function only calculates the
    statistic which can be used either as distance measure or to implement
    case specific p-values.
    """
    nobs = float(len(x))

    # Resolve the cdf: a scipy.stats distribution name, an object exposing
    # a ``cdf`` attribute, or a plain callable used as-is.
    if isinstance(cdf, str):
        cdf = getattr(stats.distributions, cdf).cdf
    elif hasattr(cdf, 'cdf'):
        cdf = getattr(cdf, 'cdf')

    sorted_x = np.sort(x)
    cdfvals = cdf(sorted_x, *args)

    # Empirical cdf evaluated just after (ecdf_hi) and just before (ecdf_lo)
    # each order statistic.
    ecdf_hi = np.arange(1.0, nobs + 1) / nobs
    ecdf_lo = np.arange(0.0, nobs) / nobs
    d_plus = (ecdf_hi - cdfvals).max()
    d_minus = (cdfvals - ecdf_lo).max()

    if alternative == 'greater':
        return d_plus
    if alternative == 'less':
        return d_minus
    return max(d_plus, d_minus)

133 

134 

def get_lilliefors_table(dist='norm'):
    """
    Generates tables for significance levels of Lilliefors test statistics

    Tables for available normal and exponential distribution testing,
    as specified in Lilliefors references above

    Parameters
    ----------
    dist : str
        distribution being tested in set {'norm', 'exp'}.

    Returns
    -------
    lf : TableDist object.
        table of critical values

    Raises
    ------
    ValueError
        If `dist` is not 'norm' or 'exp'.
    """
    # function just to keep things together
    # for this test alpha is sf probability, i.e. right tail probability

    alpha = 1 - np.array(PERCENTILES) / 100.0
    alpha = alpha[::-1]
    dist = 'normal' if dist == 'norm' else dist
    if dist not in critical_values:
        raise ValueError("Invalid dist parameter. Must be 'norm' or 'exp'")
    cv_data = critical_values[dist]
    acv_data = asymp_critical_values[dist]

    # np.float was deprecated in NumPy 1.20 and removed in 1.24; the builtin
    # float is the documented replacement and gives an identical float64 array.
    size = np.array(sorted(cv_data), dtype=float)
    crit_lf = np.array([cv_data[key] for key in sorted(cv_data)])
    # Reverse columns so critical values line up with the reversed alpha.
    crit_lf = crit_lf[:, ::-1]

    asym_params = np.array([acv_data[key] for key in sorted(acv_data)])
    asymp_fn = _make_asymptotic_function((asym_params[::-1]))

    lf = TableDist(alpha, size, crit_lf, asymptotic=asymp_fn)
    return lf

172 

173 

# Critical-value tables are built once at import time and shared by all calls
# to kstest_fit; one table per supported distribution.
lilliefors_table_norm = get_lilliefors_table(dist='norm')
lilliefors_table_expon = get_lilliefors_table(dist='exp')

176 

177 

def pval_lf(d_max, n):
    """
    Approximate pvalues for Lilliefors test

    This is only valid for pvalues smaller than 0.1 which is not checked in
    this function.

    Parameters
    ----------
    d_max : array_like
        two-sided Kolmogorov-Smirnov test statistic
    n : int or float
        sample size

    Returns
    -------
    p-value : float or ndarray
        pvalue according to approximation formula of Dallal and Wilkinson.

    Notes
    -----
    This is mainly a helper function where the calling code should dispatch
    on bound violations. Therefore it does not check whether the pvalue is in
    the valid range.

    Precision for the pvalues is around 2 to 3 decimals. This approximation is
    also used by other statistical packages (e.g. R:fBasics) but might not be
    the most precise available.

    References
    ----------
    DallalWilkinson1986
    """
    # todo: check boundaries, valid range for n and Dmax
    if n > 100:
        # Rebind rather than use ``*=`` so a caller-supplied ndarray is not
        # mutated in place; only the local n is rescaled to 100.
        d_max = d_max * (n / 100.) ** 0.49
        n = 100
    pval = np.exp(-7.01256 * d_max ** 2 * (n + 2.78019)
                  + 2.99587 * d_max * np.sqrt(n + 2.78019) - 0.122119
                  + 0.974598 / np.sqrt(n) + 1.67997 / n)
    return pval

219 

220 

def kstest_fit(x, dist='norm', pvalmethod=None):
    """
    Test assumed normal or exponential distribution using Lilliefors' test.

    Lilliefors' test is a Kolmogorov-Smirnov test with estimated parameters.

    Parameters
    ----------
    x : array_like, 1d
        Data to test.
    dist : {'norm', 'exp'}, optional
        The assumed distribution.
    pvalmethod : {'approx', 'table'}, optional
        The method used to compute the p-value of the test statistic. In
        general, 'table' is preferred and makes use of a very large simulation.
        'approx' is only valid for normality. if `dist = 'exp'` `table` is
        always used. 'approx' uses the approximation formula of Dalal and
        Wilkinson, valid for pvalues < 0.1. If the pvalue is larger than 0.1,
        then the result of `table` is returned.

    Returns
    -------
    ksstat : float
        Kolmogorov-Smirnov test statistic with estimated mean and variance.
    pvalue : float
        If the pvalue is lower than some threshold, e.g. 0.05, then we can
        reject the Null hypothesis that the sample comes from a normal
        distribution.

    Raises
    ------
    ValueError
        If `dist` or `pvalmethod` is not one of the supported values, or if
        there are too few observations.

    Notes
    -----
    'table' uses an improved table based on 10,000,000 simulations. The
    critical values are approximated using
    log(cv_alpha) = b_alpha + c[0] log(n) + c[1] log(n)**2
    where cv_alpha is the critical value for a test with size alpha,
    b_alpha is an alpha-specific intercept term and c[1] and c[2] are
    coefficients that are shared all alphas.
    Values in the table are linearly interpolated. Values outside the
    range are be returned as bounds, 0.990 for large and 0.001 for small
    pvalues.

    For implementation details, see lilliefors_critical_value_simulation.py in
    the test directory.
    """
    if pvalmethod is None:
        pvalmethod = 'approx'
        import warnings
        msg = 'The default pvalmethod will change from "approx" to "table" ' \
              'after 0.11. The "table" method uses values from a very large ' \
              'simulation and is more accurate. Explicitly set this ' \
              'parameter to "approx" or "table" to silence this warning'
        warnings.warn(msg, FutureWarning)
    # Previously an unrecognized string silently fell through to 'table';
    # reject it explicitly instead.
    if pvalmethod not in ('approx', 'table'):
        raise ValueError("Invalid pvalmethod parameter, must be 'approx' "
                         "or 'table'")

    x = np.asarray(x)
    nobs = len(x)

    if dist == 'norm':
        # Standardize with the sample mean and (ddof=1) std before testing.
        z = (x - x.mean()) / x.std(ddof=1)
        test_d = stats.norm.cdf
        lilliefors_table = lilliefors_table_norm
    elif dist == 'exp':
        # Scale by the sample mean; 'approx' is not valid for the
        # exponential case, so force the table-based p-value.
        z = x / x.mean()
        test_d = stats.expon.cdf
        lilliefors_table = lilliefors_table_expon
        pvalmethod = 'table'
    else:
        raise ValueError("Invalid dist parameter, must be 'norm' or 'exp'")

    min_nobs = 4 if dist == 'norm' else 3
    if nobs < min_nobs:
        raise ValueError('Test for distribution {0} requires at least {1} '
                         'observations'.format(dist, min_nobs))

    d_ks = ksstat(z, test_d, alternative='two_sided')

    if pvalmethod == 'approx':
        pval = pval_lf(d_ks, nobs)
        # The approximation is only valid for pvalues < 0.1; fall back to
        # the simulated table otherwise.
        if pval > 0.1:
            pval = lilliefors_table.prob(d_ks, nobs)
    else:  # pvalmethod == 'table'
        pval = lilliefors_table.prob(d_ks, nobs)

    return d_ks, pval

305 

306 

# Public aliases: `lilliefors` is the canonical name for the test, and the
# kstest_* variants preset the distribution being tested (norm is already
# the default of kstest_fit; exp is bound via partial).
lilliefors = kstest_fit
kstest_normal = kstest_fit
kstest_exponential = partial(kstest_fit, dist='exp')