Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/statsmodels/base/data.py : 16%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
Base tools for handling various kinds of data structures, attaching metadata to
results, and doing data cleaning
4"""
5from statsmodels.compat.python import iteritems, lmap
7from functools import reduce
9import numpy as np
10from pandas import DataFrame, Series, isnull, MultiIndex
12import statsmodels.tools.data as data_util
13from statsmodels.tools.decorators import cache_readonly, cache_writable
14from statsmodels.tools.sm_exceptions import MissingDataError
17def _asarray_2dcolumns(x):
18 if np.asarray(x).ndim > 1 and np.asarray(x).squeeze().ndim == 1:
19 return
22def _asarray_2d_null_rows(x):
23 """
24 Makes sure input is an array and is 2d. Makes sure output is 2d. True
25 indicates a null in the rows of 2d x.
26 """
27 #Have to have the asarrays because isnull does not account for array_like
28 #input
29 x = np.asarray(x)
30 if x.ndim == 1:
31 x = x[:, None]
32 return np.any(isnull(x), axis=1)[:, None]
def _nan_rows(*arrs):
    """
    Combine the null-row indicators of several inputs into one boolean array.

    The result is True wherever a row of any input contains a null.  Inputs
    can be any mixture of Series, DataFrames or array_like; each one is run
    through ``_asarray_2d_null_rows``.  The combined mask is squeezed before
    it is returned.
    """
    if len(arrs) == 1:
        # reduce() needs a second operand; a lone False never adds a null
        arrs = arrs + ([[False]],)

    def _fold(acc, nxt):
        # Only carry ``acc`` forward when it is itself a boolean mask (the
        # result of an earlier fold).  DataFrames expose ``dtypes`` rather
        # than ``dtype`` and therefore never qualify.
        if hasattr(acc, 'dtype') and acc.dtype == bool:
            carried = acc
        else:
            carried = False
        acc_nulls = _asarray_2d_null_rows(acc)
        nxt_nulls = _asarray_2d_null_rows(nxt)
        return np.logical_or(acc_nulls, (carried | nxt_nulls))

    merged = reduce(_fold, arrs)
    return merged.squeeze()
class ModelData(object):
    """
    Class responsible for handling input data and extracting metadata into the
    appropriate form

    Parameters
    ----------
    endog : array_like
        Kept as ``orig_endog`` and, converted to an ndarray, as ``endog``.
    exog : array_like, optional
        Kept as ``orig_exog`` and, converted to an ndarray, as ``exog``.
        May be None.
    missing : str
        'none' (default) attaches the data untouched.  Any other value is
        forwarded to ``handle_missing`` (which understands 'raise' and
        'drop').
    hasconst : bool, optional
        Forwarded to ``_handle_constant``.
    **kwargs
        'design_info' and 'formula' are stored as attributes of the same
        name.  All remaining entries are attached as attributes too, after
        missing-value handling when ``missing`` is not 'none'.
    """
    _param_names = None
    _cov_names = None

    def __init__(self, endog, exog=None, missing='none', hasconst=None,
                 **kwargs):
        if data_util._is_recarray(endog) or data_util._is_recarray(exog):
            import warnings
            from statsmodels.tools.sm_exceptions import recarray_warning
            warnings.warn(recarray_warning, FutureWarning)
        if 'design_info' in kwargs:
            self.design_info = kwargs.pop('design_info')
        if 'formula' in kwargs:
            self.formula = kwargs.pop('formula')
        if missing != 'none':
            arrays, nan_idx = self.handle_missing(endog, exog, missing,
                                                  **kwargs)
            self.missing_row_idx = nan_idx
            self.__dict__.update(arrays)  # attach all the data arrays
            self.orig_endog = self.endog
            self.orig_exog = self.exog
            self.endog, self.exog = self._convert_endog_exog(self.endog,
                                                             self.exog)
        else:
            self.__dict__.update(kwargs)  # attach the extra arrays anyway
            self.orig_endog = endog
            self.orig_exog = exog
            self.endog, self.exog = self._convert_endog_exog(endog, exog)

        self.const_idx = None
        self.k_constant = 0
        self._handle_constant(hasconst)
        self._check_integrity()
        self._cache = {}

    def __getstate__(self):
        """
        Return a shallow copy of the instance dict for pickling.

        ``design_info`` is removed from the copy and replaced by a
        ``restore_design_info`` flag that ``__setstate__`` acts upon.
        """
        from copy import copy
        d = copy(self.__dict__)
        if "design_info" in d:
            del d["design_info"]
            d["restore_design_info"] = True
        return d

    def __setstate__(self, d):
        """
        Restore the instance dict, rebuilding ``design_info`` if flagged.

        When ``restore_design_info`` is present the formula is re-evaluated
        with patsy against ``d['frame']`` or, if that key is missing, against
        ``orig_endog`` joined with ``orig_exog``.  Several ``eval_env``
        depths are tried in turn; if all of them fail the last error is
        re-raised.
        """
        if "restore_design_info" in d:
            # NOTE: there may be a more performant way to do this
            from patsy import dmatrices, PatsyError
            exc = []
            try:
                data = d['frame']
            except KeyError:
                data = d['orig_endog'].join(d['orig_exog'])

            # sequence is a guess where to likely find it
            for depth in [2, 3, 1, 0, 4]:
                try:
                    _, design = dmatrices(d['formula'], data, eval_env=depth,
                                          return_type='dataframe')
                    break
                except (NameError, PatsyError) as e:
                    # the name bound by ``as`` is cleared when the except
                    # block ends, so keep our own reference for re-raising
                    exc.append(e)
            else:
                raise exc[-1]

            self.design_info = design.design_info
            del d["restore_design_info"]
        self.__dict__.update(d)

    def _handle_constant(self, hasconst):
        """
        Detect a constant column in ``exog``; set ``k_constant``/``const_idx``.

        Parameters
        ----------
        hasconst : bool or None
            False skips detection (no constant).  A truthy value forces
            ``k_constant`` to 1 even when no constant column is found.
            Otherwise, when no usable explicit constant column exists, an
            implicit constant is looked for with a matrix-rank comparison.

        Raises
        ------
        MissingDataError
            If the column maxima of ``exog`` are not all finite.
        """
        if hasconst is False or self.exog is None:
            self.k_constant = 0
            self.const_idx = None
        else:
            # detect where the constant is
            check_implicit = False
            exog_max = np.max(self.exog, axis=0)
            if not np.isfinite(exog_max).all():
                raise MissingDataError('exog contains inf or nans')
            exog_min = np.min(self.exog, axis=0)
            # columns whose max equals their min hold a single value
            const_idx = np.where(exog_max == exog_min)[0].squeeze()
            self.k_constant = const_idx.size

            if self.k_constant == 1:
                if self.exog[:, const_idx].mean() != 0:
                    self.const_idx = int(const_idx)
                else:
                    # we only have a zero column and no other constant
                    check_implicit = True
            elif self.k_constant > 1:
                # we have more than one constant column
                # look for ones
                values = []  # keep values if we need != 0
                for idx in const_idx:
                    value = self.exog[:, idx].mean()
                    if value == 1:
                        self.k_constant = 1
                        self.const_idx = int(idx)
                        break
                    values.append(value)
                else:
                    # we did not break, no column of ones
                    pos = (np.array(values) != 0)
                    if pos.any():
                        # take the first nonzero column
                        self.k_constant = 1
                        self.const_idx = int(const_idx[pos.argmax()])
                    else:
                        # only zero columns
                        check_implicit = True
            elif self.k_constant == 0:
                check_implicit = True
            else:
                # should not be here
                pass

            if check_implicit and not hasconst:
                # look for implicit constant
                # Compute rank of augmented matrix
                augmented_exog = np.column_stack(
                    (np.ones(self.exog.shape[0]), self.exog))
                rank_augm = np.linalg.matrix_rank(augmented_exog)
                rank_orig = np.linalg.matrix_rank(self.exog)
                # prepending a column of ones leaves the rank unchanged
                # exactly when that column is already in the span of exog
                self.k_constant = int(rank_orig == rank_augm)
                self.const_idx = None
            elif hasconst:
                # Ensure k_constant is 1 any time hasconst is True
                # even if one is not found
                self.k_constant = 1

    @classmethod
    def _drop_nans(cls, x, nan_mask):
        """Return the rows of ``x`` selected by the boolean ``nan_mask``."""
        return x[nan_mask]

    @classmethod
    def _drop_nans_2d(cls, x, nan_mask):
        """Return ``x`` restricted to ``nan_mask`` along rows and columns."""
        return x[nan_mask][:, nan_mask]

    @classmethod
    def handle_missing(cls, endog, exog, missing, **kwargs):
        """
        This returns a dictionary with keys endog, exog and the keys of
        kwargs. It preserves Nones.

        Parameters
        ----------
        endog, exog : array_like
            Main data arrays; ``exog`` may be None.
        missing : str
            'raise' raises when nulls are found, 'drop' removes the affected
            rows.  Any other value raises ValueError once nulls are found.
        **kwargs
            Extra arrays.  Entries that are None or 0-dimensional come back
            as None.  Arrays that are 1d (possibly after squeezing) are
            filtered by row; other 2d arrays are _assumed_ to be symmetric
            and are filtered along both axes.  The special key
            ``missing_idx`` is a boolean mask of rows that patsy already
            removed from ``endog``/``exog``; the extra arrays are then still
            expected to have the original length.

        Returns
        -------
        combined : dict
            The (possibly filtered) arrays by name.
        nan_idx : list
            Positions of the dropped rows; empty when nothing was missing.

        Raises
        ------
        MissingDataError
            If nulls are found and ``missing == 'raise'``.
        ValueError
            For arrays with more than 2 dimensions, a length mismatch between
            ``missing_idx`` and the extra arrays, or an unknown ``missing``
            option.
        """
        none_array_names = []

        # patsy's already dropped NaNs in y/X
        missing_idx = kwargs.pop('missing_idx', None)

        if missing_idx is not None:
            # y, X already handled by patsy. add back in later.
            combined = ()
            combined_names = []
            if exog is None:
                none_array_names += ['exog']
        elif exog is not None:
            combined = (endog, exog)
            combined_names = ['endog', 'exog']
        else:
            combined = (endog,)
            combined_names = ['endog']
            none_array_names += ['exog']

        # deal with other arrays
        combined_2d = ()
        combined_2d_names = []
        if len(kwargs):
            for key, value_array in iteritems(kwargs):
                if value_array is None or value_array.ndim == 0:
                    none_array_names += [key]
                    continue
                # grab 1d arrays
                if value_array.ndim == 1:
                    combined += (np.asarray(value_array),)
                    combined_names += [key]
                elif value_array.squeeze().ndim == 1:
                    combined += (np.asarray(value_array),)
                    combined_names += [key]

                # grab 2d arrays that are _assumed_ to be symmetric
                elif value_array.ndim == 2:
                    combined_2d += (np.asarray(value_array),)
                    combined_2d_names += [key]
                else:
                    raise ValueError("Arrays with more than 2 dimensions "
                                     "are not yet handled")

        if missing_idx is not None:
            nan_mask = missing_idx
            # Rows patsy kept, i.e. the rows endog/exog still contain.  This
            # is taken *before* ``nan_mask`` is updated in place below so
            # that every slice feeding ``updated_row_mask`` has the same
            # length as endog/exog.
            kept_rows = ~missing_idx
            updated_row_mask = None
            if combined:  # there were extra arrays not handled by patsy
                combined_nans = _nan_rows(*combined)
                if combined_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra arrays given to model.")
                # for going back and updated endog/exog
                updated_row_mask = combined_nans[kept_rows]
                nan_mask |= combined_nans  # for updating extra arrays only
            if combined_2d:
                # unpack so each 2d array is inspected on its own, exactly
                # as in the non-patsy branch below
                combined_2d_nans = _nan_rows(*combined_2d)
                if combined_2d_nans.shape[0] != nan_mask.shape[0]:
                    raise ValueError("Shape mismatch between endog/exog "
                                     "and extra 2d arrays given to model.")
                if updated_row_mask is not None:
                    updated_row_mask |= combined_2d_nans[kept_rows]
                else:
                    updated_row_mask = combined_2d_nans[kept_rows]
                nan_mask |= combined_2d_nans

        else:
            nan_mask = _nan_rows(*combined)
            if combined_2d:
                nan_mask = _nan_rows(*(nan_mask[:, None],) + combined_2d)

        if not np.any(nan_mask):  # no missing do not do anything
            combined = dict(zip(combined_names, combined))
            if combined_2d:
                combined.update(dict(zip(combined_2d_names, combined_2d)))
            if none_array_names:
                combined.update(dict(zip(none_array_names,
                                         [None] * len(none_array_names))))

            if missing_idx is not None:
                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            return combined, []

        elif missing == 'raise':
            raise MissingDataError("NaNs were encountered in the data")

        elif missing == 'drop':
            # from here on True means "keep this row"
            nan_mask = ~nan_mask
            drop_nans = lambda x: cls._drop_nans(x, nan_mask)
            drop_nans_2d = lambda x: cls._drop_nans_2d(x, nan_mask)
            combined = dict(zip(combined_names, lmap(drop_nans, combined)))

            if missing_idx is not None:
                if updated_row_mask is not None:
                    updated_row_mask = ~updated_row_mask
                    # update endog/exog with this new information
                    endog = cls._drop_nans(endog, updated_row_mask)
                    if exog is not None:
                        exog = cls._drop_nans(exog, updated_row_mask)

                combined.update({'endog': endog})
                if exog is not None:
                    combined.update({'exog': exog})

            if combined_2d:
                combined.update(dict(zip(combined_2d_names,
                                         lmap(drop_nans_2d, combined_2d))))
            if none_array_names:
                combined.update(dict(zip(none_array_names,
                                         [None] * len(none_array_names))))

            return combined, np.where(~nan_mask)[0].tolist()
        else:
            raise ValueError("missing option %s not understood" % missing)

    def _convert_endog_exog(self, endog, exog):
        """
        Convert endog/exog to ndarrays.

        Returns ``(yarr, xarr)``; ``xarr`` is None when ``exog`` is None and
        is otherwise always 2d.  Raises ValueError if exog is neither 1d
        nor 2d.
        """
        # for consistent outputs if endog is (n,1)
        yarr = self._get_yarr(endog)
        xarr = None
        if exog is not None:
            xarr = self._get_xarr(exog)
            if xarr.ndim == 1:
                xarr = xarr[:, None]
            if xarr.ndim != 2:
                raise ValueError("exog is not 1d or 2d")

        return yarr, xarr

    @cache_writable()
    def ynames(self):
        """
        Name(s) of endog: taken from ``orig_endog`` when it carries names,
        generated otherwise.  A single name is returned bare, several as a
        list.
        """
        endog = self.orig_endog
        ynames = self._get_names(endog)
        if not ynames:
            ynames = _make_endog_names(self.endog)

        if len(ynames) == 1:
            return ynames[0]
        else:
            return list(ynames)

    @cache_writable()
    def xnames(self):
        """
        List of exog column names (taken from ``orig_exog`` or generated),
        or None when there is no exog.
        """
        exog = self.orig_exog
        if exog is not None:
            xnames = self._get_names(exog)
            if not xnames:
                xnames = _make_exog_names(self.exog)
            return list(xnames)
        return None

    @property
    def param_names(self):
        """Explicitly set parameter names, falling back to ``xnames``."""
        # for handling names of 'extra' parameters in summary, etc.
        return self._param_names or self.xnames

    @param_names.setter
    def param_names(self, values):
        self._param_names = values

    @property
    def cov_names(self):
        """
        Labels for covariance matrices

        In multidimensional models, each dimension of a covariance matrix
        differs from the number of param_names.

        If not set, returns param_names
        """
        # for handling names of covariance names in multidimensional models
        if self._cov_names is not None:
            return self._cov_names
        return self.param_names

    @cov_names.setter
    def cov_names(self, value):
        # for handling names of covariance names in multidimensional models
        self._cov_names = value

    @cache_readonly
    def row_labels(self):
        """Row labels of ``orig_exog`` or, without exog, of ``orig_endog``."""
        exog = self.orig_exog
        if exog is not None:
            row_labels = self._get_row_labels(exog)
        else:
            endog = self.orig_endog
            row_labels = self._get_row_labels(endog)
        return row_labels

    def _get_row_labels(self, arr):
        """Base implementation: no row labels are available."""
        return None

    def _get_names(self, arr):
        """
        Extract column names from ``arr``.

        DataFrames yield their column labels (MultiIndex entries are joined
        with '_'), a named Series yields a one-element list, and any other
        object yields ``arr.dtype.names`` when that attribute exists.
        Returns None when no names can be found.
        """
        if isinstance(arr, DataFrame):
            if isinstance(arr.columns, MultiIndex):
                # Flatten MultiIndexes into "simple" column names
                return ['_'.join((level for level in c if level))
                        for c in arr.columns]
            else:
                return list(arr.columns)
        elif isinstance(arr, Series):
            if arr.name:
                return [arr.name]
            else:
                return
        else:
            try:
                return arr.dtype.names
            except AttributeError:
                pass

        return None

    def _get_yarr(self, endog):
        """
        Convert endog to a squeezed ndarray.

        An input of length one is never squeezed down to a scalar; it is
        returned as a 1d array of length one instead.
        """
        if data_util._is_structured_ndarray(endog):
            endog = data_util.struct_to_ndarray(endog)
        endog = np.asarray(endog)
        if len(endog) == 1:  # never squeeze to a scalar
            if endog.ndim == 1:
                return endog
            elif endog.ndim > 1:
                return np.asarray([endog.squeeze()])

        return endog.squeeze()

    def _get_xarr(self, exog):
        """Convert exog to an ndarray, unpacking structured arrays first."""
        if data_util._is_structured_ndarray(exog):
            exog = data_util.struct_to_ndarray(exog)
        return np.asarray(exog)

    def _check_integrity(self):
        """Raise ValueError if exog is present and its length differs from
        that of endog."""
        if self.exog is not None:
            if len(self.exog) != len(self.endog):
                raise ValueError("endog and exog matrices are different sizes")

    def wrap_output(self, obj, how='columns', names=None):
        """
        Dispatch ``obj`` to the ``attach_*`` method selected by ``how``.

        ``names`` is only forwarded for 'generic_columns' and
        'generic_columns_2d'.  An unrecognised ``how`` returns ``obj``
        unchanged.
        """
        if how == 'columns':
            return self.attach_columns(obj)
        elif how == 'rows':
            return self.attach_rows(obj)
        elif how == 'cov':
            return self.attach_cov(obj)
        elif how == 'dates':
            return self.attach_dates(obj)
        elif how == 'columns_eq':
            return self.attach_columns_eq(obj)
        elif how == 'cov_eq':
            return self.attach_cov_eq(obj)
        elif how == 'generic_columns':
            return self.attach_generic_columns(obj, names)
        elif how == 'generic_columns_2d':
            return self.attach_generic_columns_2d(obj, names)
        elif how == 'ynames':
            return self.attach_ynames(obj)
        elif how == 'multivariate_confint':
            return self.attach_mv_confint(obj)
        else:
            return obj

    # The attach_* methods below are identity hooks in this base class:
    # each returns ``result`` unchanged.

    def attach_columns(self, result):
        return result

    def attach_columns_eq(self, result):
        return result

    def attach_cov(self, result):
        return result

    def attach_cov_eq(self, result):
        return result

    def attach_rows(self, result):
        return result

    def attach_dates(self, result):
        return result

    def attach_mv_confint(self, result):
        return result

    def attach_generic_columns(self, result, *args, **kwargs):
        return result

    def attach_generic_columns_2d(self, result, *args, **kwargs):
        return result

    def attach_ynames(self, result):
        return result
class PatsyData(ModelData):
    """ModelData variant that reads names from the input's design info."""

    def _get_names(self, arr):
        # ``arr`` must expose ``design_info``; its column names are used
        design = arr.design_info
        return design.column_names
class PandasData(ModelData):
    """
    ModelData variant for pandas inputs: model results handed to the
    ``attach_*`` methods come back as Series/DataFrame objects labelled with
    the names and row labels taken from the original data.
    """

    def _convert_endog_exog(self, endog, exog=None):
        """Convert to ndarrays, refusing data that ends up as object dtype."""
        # TODO: remove this when we handle dtype systematically
        endog = np.asarray(endog)
        if exog is not None:
            exog = np.asarray(exog)
        exog_is_object = exog is not None and exog.dtype == object
        if endog.dtype == object or exog_is_object:
            raise ValueError("Pandas data cast to numpy dtype of object. "
                             "Check input data with np.asarray(data).")
        return super(PandasData, self)._convert_endog_exog(endog, exog)

    @classmethod
    def _drop_nans(cls, x, nan_mask):
        """Select the rows of ``x`` flagged True in ``nan_mask``."""
        if not isinstance(x, (Series, DataFrame)):
            # extra arguments could be plain ndarrays
            return super(PandasData, cls)._drop_nans(x, nan_mask)
        return x.loc[nan_mask]

    @classmethod
    def _drop_nans_2d(cls, x, nan_mask):
        """Apply ``nan_mask`` to both the rows and the columns of ``x``."""
        if not isinstance(x, (Series, DataFrame)):
            # extra arguments could be plain ndarrays
            return super(PandasData, cls)._drop_nans_2d(x, nan_mask)
        rows_kept = x.loc[nan_mask]
        return rows_kept.loc[:, nan_mask]

    def _check_integrity(self):
        """Require matching indexes when both inputs have one, then run the
        base-class length check."""
        endog = self.orig_endog
        exog = self.orig_exog
        # exog can be None and we could be upcasting one or the other
        both_indexed = hasattr(endog, 'index') and hasattr(exog, 'index')
        if exog is not None and both_indexed:
            if not endog.index.equals(exog.index):
                raise ValueError("The indices for endog and exog are not "
                                 "aligned")
        super(PandasData, self)._check_integrity()

    def _get_row_labels(self, arr):
        """Return ``arr.index``, or the endog index if ``arr`` has none."""
        try:
            labels = arr.index
        except AttributeError:
            # if we've gotten here it's because endog is pandas and
            # exog is not, so just return the row labels from endog
            labels = self.orig_endog.index
        return labels

    def attach_generic_columns(self, result, names):
        """Wrap ``result`` in a Series indexed by the attribute ``names``."""
        # ``names`` is the name of the attribute holding the labels
        labels = getattr(self, names, None)
        return Series(result, index=labels)

    def attach_generic_columns_2d(self, result, rownames, colnames=None):
        """Wrap ``result`` in a DataFrame labelled by the named attributes;
        ``colnames`` defaults to ``rownames``."""
        col_attr = colnames if colnames else rownames
        row_labels = getattr(self, rownames, None)
        col_labels = getattr(self, col_attr, None)
        return DataFrame(result, index=row_labels, columns=col_labels)

    def attach_columns(self, result):
        """Label ``result`` with ``param_names`` along its first axis."""
        # this can either be a 1d array or a scalar
        # do not squeeze because it might be a 2d row array
        # if it needs a squeeze, the bug is elsewhere
        if result.ndim > 1:
            # for e.g., confidence intervals
            return DataFrame(result, index=self.param_names)
        return Series(result, index=self.param_names)

    def attach_columns_eq(self, result):
        """DataFrame with ``xnames`` as index and ``ynames`` as columns."""
        return DataFrame(result, index=self.xnames, columns=self.ynames)

    def attach_cov(self, result):
        """DataFrame labelled by ``cov_names`` on both axes."""
        labels = self.cov_names
        return DataFrame(result, index=labels, columns=self.cov_names)

    def attach_cov_eq(self, result):
        """DataFrame labelled by ``ynames`` on both axes."""
        return DataFrame(result, index=self.ynames, columns=self.ynames)

    def attach_rows(self, result):
        """Label ``result`` with the trailing ``len(result)`` row labels."""
        # assumes if len(row_labels) > len(result) it's bc it was truncated
        # at the front, for AR lags, for example
        flat = result.squeeze()
        n_endog = np.array(self.ynames, ndmin=1).shape[0]
        if n_endog > 1 and flat.shape == (n_endog,):
            # a single row covering several endog variables stays 2d
            flat = flat[None, :]
        # May be zero-dim, for example in the case of forecast one step in tsa
        if flat.ndim >= 2:
            return DataFrame(result, index=self.row_labels[-len(result):],
                             columns=self.ynames)
        return Series(flat, index=self.row_labels[-len(result):])

    def attach_dates(self, result):
        """Label ``result`` with ``predict_dates`` as the index."""
        flat = result.squeeze()
        n_endog = np.array(self.ynames, ndmin=1).shape[0]
        if n_endog > 1 and flat.shape == (n_endog,):
            # a single row covering several endog variables stays 2d
            flat = np.asarray(flat)[None, :]
        # May be zero-dim, for example in the case of forecast one step in tsa
        if flat.ndim >= 2:
            return DataFrame(result, index=self.predict_dates,
                             columns=self.ynames)
        return Series(flat, index=self.predict_dates)

    def attach_mv_confint(self, result):
        """Reshape ``result`` to two columns ('lower', 'upper') indexed by
        ``cov_names``."""
        bounds = result.reshape((-1, 2))
        return DataFrame(bounds,
                         index=self.cov_names,
                         columns=['lower', 'upper'])

    def attach_ynames(self, result):
        """Name a Series, or the columns of a DataFrame, after ``ynames``."""
        flat = result.squeeze()
        # May be zero-dim, for example in the case of forecast one step in tsa
        if flat.ndim >= 2:
            return DataFrame(result, columns=self.ynames)
        return Series(flat, name=self.ynames)
612def _make_endog_names(endog):
613 if endog.ndim == 1 or endog.shape[1] == 1:
614 ynames = ['y']
615 else: # for VAR
616 ynames = ['y%d' % (i+1) for i in range(endog.shape[1])]
618 return ynames
621def _make_exog_names(exog):
622 exog_var = exog.var(0)
623 if (exog_var == 0).any():
624 # assumes one constant in first or last position
625 # avoid exception if more than one constant
626 const_idx = exog_var.argmin()
627 exog_names = ['x%d' % i for i in range(1, exog.shape[1])]
628 exog_names.insert(const_idx, 'const')
629 else:
630 exog_names = ['x%d' % i for i in range(1, exog.shape[1]+1)]
632 return exog_names
def handle_missing(endog, exog=None, missing='none', **kwargs):
    """
    Apply missing-value handling using the data class suited to the inputs.

    With ``missing == 'none'`` the inputs are returned untouched as
    ``(dict, None)``, the dict holding endog, exog and every extra keyword.
    Otherwise the work is delegated to the ``handle_missing`` classmethod of
    the class chosen by ``handle_data_class_factory``.
    """
    # the class is resolved first, so unsupported inputs are rejected even
    # when no missing-value handling was requested
    klass = handle_data_class_factory(endog, exog)
    if missing != 'none':
        return klass.handle_missing(endog, exog, missing=missing, **kwargs)

    passthrough = {'endog': endog, 'exog': exog}
    passthrough.update(kwargs)
    return passthrough, None
def handle_data_class_factory(endog, exog):
    """
    Given inputs, return the data-handling class that matches their type.

    The ``data_util`` type checks are tried in a fixed order and the first
    one that succeeds decides the class.  Raises ValueError when none of
    them recognises the inputs.
    """
    candidates = (
        (data_util._is_using_ndarray_type, ModelData),
        (data_util._is_using_pandas, PandasData),
        (data_util._is_using_patsy, PatsyData),
        # keep this check last
        (data_util._is_using_ndarray, ModelData),
    )
    for is_match, klass in candidates:
        if is_match(endog, exog):
            return klass
    raise ValueError('unrecognized data structures: %s / %s' %
                     (type(endog), type(exog)))
def handle_data(endog, exog, missing='none', hasconst=None, **kwargs):
    """
    Build the data-handling object for the given inputs.

    Lists and tuples are converted to ndarrays up front; the class picked by
    ``handle_data_class_factory`` is then instantiated with all arguments.
    """
    # deal with lists and tuples up-front
    sequence_types = (list, tuple)
    endog = np.asarray(endog) if isinstance(endog, sequence_types) else endog
    exog = np.asarray(exog) if isinstance(exog, sequence_types) else exog

    data_class = handle_data_class_factory(endog, exog)
    return data_class(endog, exog=exog, missing=missing, hasconst=hasconst,
                      **kwargs)